[llvm-branch-commits] [llvm] Use register pair for PC spill (PR #169098)

Scott Linder via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Mon Nov 24 10:18:30 PST 2025


https://github.com/slinder1 updated https://github.com/llvm/llvm-project/pull/169098

>From b46525ba7f2e122695a399d6a8dd049ae3295cef Mon Sep 17 00:00:00 2001
From: Scott Linder <Scott.Linder at amd.com>
Date: Wed, 29 Oct 2025 18:46:12 +0000
Subject: [PATCH] Use register pair for PC spill

---
 llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp  |   20 +
 .../CodeGen/AMDGPU/GlobalISel/assert-align.ll |    2 +-
 .../GlobalISel/call-outgoing-stack-args.ll    |    8 +-
 .../CodeGen/AMDGPU/GlobalISel/localizer.ll    |    2 +-
 .../abi-attribute-hints-undefined-behavior.ll |    2 +-
 .../CodeGen/AMDGPU/amdgcn-call-whole-wave.ll  |    8 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll   |   44 +-
 .../amdgpu-simplify-libcall-pow-codegen.ll    |  280 +--
 ...tor-flatscratchinit-undefined-behavior2.ll |   13 +-
 llvm/test/CodeGen/AMDGPU/bf16.ll              |  166 +-
 .../test/CodeGen/AMDGPU/branch-relax-spill.ll |  156 +-
 .../CodeGen/AMDGPU/call-argument-types.ll     |   40 +-
 .../AMDGPU/call-graph-register-usage.ll       |    2 +-
 .../AMDGPU/call-preserved-registers.ll        |  116 +-
 .../test/CodeGen/AMDGPU/callee-frame-setup.ll |  106 +-
 .../callee-special-input-vgprs-packed.ll      |   14 +-
 .../AMDGPU/callee-special-input-vgprs.ll      |   14 +-
 .../AMDGPU/cross-block-use-is-not-abi-copy.ll |    8 +-
 llvm/test/CodeGen/AMDGPU/debug-frame.ll       |    8 +-
 .../AMDGPU/dwarf-multi-register-use-crash.ll  |   64 +-
 .../dynamic-vgpr-reserve-stack-for-cwsr.ll    |    4 +-
 .../fix-frame-reg-in-custom-csr-spills.ll     |    2 +-
 ...frame-setup-without-sgpr-to-vgpr-spills.ll |   25 +-
 .../CodeGen/AMDGPU/function-args-inreg.ll     |    8 +-
 .../CodeGen/AMDGPU/gfx-call-non-gfx-func.ll   |  144 +-
 .../AMDGPU/gfx-callable-argument-types.ll     | 1234 +++++++------
 .../gfx-callable-preserved-registers.ll       |   72 +-
 llvm/test/CodeGen/AMDGPU/global-alias.ll      |    2 +-
 .../identical-subrange-spill-infloop.ll       |   92 +-
 llvm/test/CodeGen/AMDGPU/indirect-call.ll     | 1104 +++++------
 .../CodeGen/AMDGPU/insert-delay-alu-bug.ll    |    2 +-
 .../CodeGen/AMDGPU/insert-waitcnts-crash.ll   |   12 +-
 .../AMDGPU/materialize-frame-index-sgpr.ll    | 1634 +++++++++--------
 .../CodeGen/AMDGPU/mul24-pass-ordering.ll     |   20 +-
 .../AMDGPU/need-fp-from-vgpr-spills.ll        |    6 +-
 llvm/test/CodeGen/AMDGPU/nested-calls.ll      |    4 +-
 .../AMDGPU/no-source-locations-in-prologue.ll |    2 +-
 llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll |    6 +-
 .../AMDGPU/sgpr-spill-overlap-wwm-reserve.mir |  190 +-
 .../AMDGPU/sgpr-spills-split-regalloc.ll      |   27 +-
 .../AMDGPU/shufflevector.v2i64.v8i64.ll       |  299 +--
 .../si-lower-sgpr-spills-vgpr-lanes-usage.mir |   18 +-
 llvm/test/CodeGen/AMDGPU/sibling-call.ll      |  222 +--
 llvm/test/CodeGen/AMDGPU/stack-realign.ll     |    2 +-
 .../CodeGen/AMDGPU/stacksave_stackrestore.ll  |   10 +-
 .../AMDGPU/strictfp_f16_abi_promote.ll        |   14 +-
 .../CodeGen/AMDGPU/swdev504645-global-fold.ll |    3 +-
 .../AMDGPU/tail-call-inreg-arguments.error.ll |    4 +-
 ...unfold-masked-merge-scalar-variablemask.ll |   38 +-
 .../AMDGPU/unstructured-cfg-def-use-issue.ll  |  168 +-
 .../CodeGen/AMDGPU/vgpr-tuple-allocation.ll   |   12 +-
 llvm/test/CodeGen/AMDGPU/wave32.ll            |    4 +-
 .../CodeGen/AMDGPU/whole-wave-functions.ll    |   20 +-
 .../AMDGPU/whole-wave-register-copy.ll        |    2 +-
 .../AMDGPU/whole-wave-register-spill.ll       |    4 +-
 .../test/CodeGen/AMDGPU/wwm-reserved-spill.ll |    8 +-
 56 files changed, 3262 insertions(+), 3229 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
index cbd08f0fb5dff..5161a75097aeb 100644
--- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
@@ -267,11 +267,19 @@ bool SILowerSGPRSpills::spillCalleeSavedRegs(
 
     std::vector<CalleeSavedInfo> CSI;
     const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
+    Register RetAddrReg = TRI->getReturnAddressReg(MF);
+    bool SpillRetAddrReg = false;
 
     for (unsigned I = 0; CSRegs[I]; ++I) {
       MCRegister Reg = CSRegs[I];
 
       if (SavedRegs.test(Reg)) {
+        if (Reg == TRI->getSubReg(RetAddrReg, AMDGPU::sub0) ||
+            Reg == TRI->getSubReg(RetAddrReg, AMDGPU::sub1)) {
+          SpillRetAddrReg = true;
+          continue;
+        }
+
         const TargetRegisterClass *RC =
           TRI->getMinimalPhysRegClass(Reg, MVT::i32);
         int JunkFI = MFI.CreateStackObject(TRI->getSpillSize(*RC),
@@ -282,6 +290,18 @@ bool SILowerSGPRSpills::spillCalleeSavedRegs(
       }
     }
 
+    // Return address uses a register pair. Add the super register to the
+    // CSI list so that it's easier to identify the entire spill and CFI
+    // can be emitted appropriately.
+    if (SpillRetAddrReg) {
+      const TargetRegisterClass *RC =
+          TRI->getMinimalPhysRegClass(RetAddrReg, MVT::i64);
+      int JunkFI = MFI.CreateStackObject(TRI->getSpillSize(*RC),
+                                         TRI->getSpillAlign(*RC), true);
+      CSI.push_back(CalleeSavedInfo(RetAddrReg, JunkFI));
+      CalleeSavedFIs.push_back(JunkFI);
+    }
+
     if (!CSI.empty()) {
       for (MachineBasicBlock *SaveBlock : SaveBlocks)
         insertCSRSaves(*SaveBlock, CSI, Indexes, LIS);
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll
index b84b31cd2702c..023398377de94 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll
@@ -23,10 +23,10 @@ define ptr addrspace(1) @call_assert_align() {
 ; CHECK-NEXT:    v_writelane_b32 v40, s31, 1
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 0
+; CHECK-NEXT:    v_readlane_b32 s30, v40, 0
 ; CHECK-NEXT:    global_store_dword v[0:1], v2, off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
-; CHECK-NEXT:    v_readlane_b32 s30, v40, 0
 ; CHECK-NEXT:    s_mov_b32 s32, s33
 ; CHECK-NEXT:    v_readlane_b32 s4, v40, 2
 ; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
index 7e6f500181ec6..2c1beb8468576 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
@@ -238,8 +238,8 @@ define void @func_caller_stack() #2 {
 ; MUBUF-NEXT:    v_writelane_b32 v40, s31, 1
 ; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:16
 ; MUBUF-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; MUBUF-NEXT:    v_readlane_b32 s31, v40, 1
 ; MUBUF-NEXT:    v_readlane_b32 s30, v40, 0
+; MUBUF-NEXT:    v_readlane_b32 s31, v40, 1
 ; MUBUF-NEXT:    s_mov_b32 s32, s33
 ; MUBUF-NEXT:    v_readlane_b32 s4, v40, 2
 ; MUBUF-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -277,8 +277,8 @@ define void @func_caller_stack() #2 {
 ; FLATSCR-NEXT:    s_addc_u32 s1, s1, external_void_func_v16i32_v16i32_v4i32 at rel32@hi+12
 ; FLATSCR-NEXT:    v_writelane_b32 v40, s31, 1
 ; FLATSCR-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; FLATSCR-NEXT:    v_readlane_b32 s31, v40, 1
 ; FLATSCR-NEXT:    v_readlane_b32 s30, v40, 0
+; FLATSCR-NEXT:    v_readlane_b32 s31, v40, 1
 ; FLATSCR-NEXT:    s_mov_b32 s32, s33
 ; FLATSCR-NEXT:    v_readlane_b32 s0, v40, 2
 ; FLATSCR-NEXT:    s_or_saveexec_b64 s[2:3], -1
@@ -363,8 +363,8 @@ define void @func_caller_byval(ptr addrspace(5) %argptr) #2 {
 ; MUBUF-NEXT:    s_waitcnt vmcnt(1)
 ; MUBUF-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:60
 ; MUBUF-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; MUBUF-NEXT:    v_readlane_b32 s31, v40, 1
 ; MUBUF-NEXT:    v_readlane_b32 s30, v40, 0
+; MUBUF-NEXT:    v_readlane_b32 s31, v40, 1
 ; MUBUF-NEXT:    s_mov_b32 s32, s33
 ; MUBUF-NEXT:    v_readlane_b32 s4, v40, 2
 ; MUBUF-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -414,8 +414,8 @@ define void @func_caller_byval(ptr addrspace(5) %argptr) #2 {
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s32 offset:56
 ; FLATSCR-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; FLATSCR-NEXT:    v_readlane_b32 s31, v40, 1
 ; FLATSCR-NEXT:    v_readlane_b32 s30, v40, 0
+; FLATSCR-NEXT:    v_readlane_b32 s31, v40, 1
 ; FLATSCR-NEXT:    s_mov_b32 s32, s33
 ; FLATSCR-NEXT:    v_readlane_b32 s0, v40, 2
 ; FLATSCR-NEXT:    s_or_saveexec_b64 s[2:3], -1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
index 72766f47030cc..35591cd602992 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
@@ -244,8 +244,8 @@ define void @sink_null_insert_pt(ptr addrspace(4) %arg0) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], 0
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
index 3194581fa4213..0e24430e7be20 100644
--- a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
+++ b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
@@ -30,8 +30,8 @@ define void @parent_func_missing_inputs() #0 {
 ; FIXEDABI-NEXT:    s_addc_u32 s17, s17, requires_all_inputs at rel32@hi+12
 ; FIXEDABI-NEXT:    v_writelane_b32 v40, s31, 1
 ; FIXEDABI-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; FIXEDABI-NEXT:    v_readlane_b32 s31, v40, 1
 ; FIXEDABI-NEXT:    v_readlane_b32 s30, v40, 0
+; FIXEDABI-NEXT:    v_readlane_b32 s31, v40, 1
 ; FIXEDABI-NEXT:    s_mov_b32 s32, s33
 ; FIXEDABI-NEXT:    v_readlane_b32 s4, v40, 2
 ; FIXEDABI-NEXT:    s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn-call-whole-wave.ll b/llvm/test/CodeGen/AMDGPU/amdgcn-call-whole-wave.ll
index 149b0cb4e052d..b6e65c8842904 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn-call-whole-wave.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn-call-whole-wave.ll
@@ -35,8 +35,8 @@ define amdgpu_gfx void @basic_test(i32 %x, i32 inreg %c, ptr addrspace(1) %ptr)
 ; DAGISEL-NEXT:    s_clause 0x1
 ; DAGISEL-NEXT:    scratch_load_b32 v41, off, s33
 ; DAGISEL-NEXT:    scratch_load_b32 v40, off, s33 offset:4
-; DAGISEL-NEXT:    v_readlane_b32 s31, v42, 1
 ; DAGISEL-NEXT:    v_readlane_b32 s30, v42, 0
+; DAGISEL-NEXT:    v_readlane_b32 s31, v42, 1
 ; DAGISEL-NEXT:    s_mov_b32 s32, s33
 ; DAGISEL-NEXT:    v_readlane_b32 s0, v42, 2
 ; DAGISEL-NEXT:    s_or_saveexec_b32 s1, -1
@@ -78,8 +78,8 @@ define amdgpu_gfx void @basic_test(i32 %x, i32 inreg %c, ptr addrspace(1) %ptr)
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    scratch_load_b32 v41, off, s33
 ; GISEL-NEXT:    scratch_load_b32 v40, off, s33 offset:4
-; GISEL-NEXT:    v_readlane_b32 s31, v42, 1
 ; GISEL-NEXT:    v_readlane_b32 s30, v42, 0
+; GISEL-NEXT:    v_readlane_b32 s31, v42, 1
 ; GISEL-NEXT:    s_mov_b32 s32, s33
 ; GISEL-NEXT:    v_readlane_b32 s0, v42, 2
 ; GISEL-NEXT:    s_or_saveexec_b32 s1, -1
@@ -787,8 +787,8 @@ define amdgpu_gfx void @ret_void(i32 %x) #0 {
 ; DAGISEL-NEXT:    s_wait_alu 0xfffe
 ; DAGISEL-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; DAGISEL-NEXT:    v_readlane_b32 s31, v40, 1
 ; DAGISEL-NEXT:    v_readlane_b32 s30, v40, 0
+; DAGISEL-NEXT:    v_readlane_b32 s31, v40, 1
 ; DAGISEL-NEXT:    s_mov_b32 s32, s33
 ; DAGISEL-NEXT:    v_readlane_b32 s0, v40, 2
 ; DAGISEL-NEXT:    s_or_saveexec_b32 s1, -1
@@ -822,8 +822,8 @@ define amdgpu_gfx void @ret_void(i32 %x) #0 {
 ; GISEL-NEXT:    s_wait_alu 0xfffe
 ; GISEL-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GISEL-NEXT:    v_readlane_b32 s31, v40, 1
 ; GISEL-NEXT:    v_readlane_b32 s30, v40, 0
+; GISEL-NEXT:    v_readlane_b32 s31, v40, 1
 ; GISEL-NEXT:    s_mov_b32 s32, s33
 ; GISEL-NEXT:    v_readlane_b32 s0, v40, 2
 ; GISEL-NEXT:    s_or_saveexec_b32 s1, -1
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
index 6fae7fdbbf9bb..1a6e0e27812fe 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
@@ -16525,18 +16525,17 @@ define inreg <40 x i8> @bitcast_v20i16_to_v40i8_scalar(<20 x i16> inreg %a, i32
 ; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v8, s30, 0
-; SI-NEXT:    v_writelane_b32 v8, s31, 1
-; SI-NEXT:    v_writelane_b32 v8, s34, 2
-; SI-NEXT:    v_writelane_b32 v8, s35, 3
-; SI-NEXT:    v_writelane_b32 v8, s36, 4
-; SI-NEXT:    v_writelane_b32 v8, s37, 5
-; SI-NEXT:    v_writelane_b32 v8, s38, 6
-; SI-NEXT:    v_writelane_b32 v8, s39, 7
-; SI-NEXT:    v_writelane_b32 v8, s48, 8
-; SI-NEXT:    v_writelane_b32 v8, s49, 9
+; SI-NEXT:    v_writelane_b32 v8, s34, 0
+; SI-NEXT:    v_writelane_b32 v8, s35, 1
+; SI-NEXT:    v_writelane_b32 v8, s36, 2
+; SI-NEXT:    v_writelane_b32 v8, s37, 3
+; SI-NEXT:    v_writelane_b32 v8, s38, 4
+; SI-NEXT:    v_writelane_b32 v8, s39, 5
+; SI-NEXT:    v_writelane_b32 v8, s48, 6
+; SI-NEXT:    v_writelane_b32 v8, s49, 7
+; SI-NEXT:    v_writelane_b32 v8, s50, 8
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
-; SI-NEXT:    v_writelane_b32 v8, s50, 10
+; SI-NEXT:    v_writelane_b32 v8, s30, 9
 ; SI-NEXT:    v_readfirstlane_b32 s39, v6
 ; SI-NEXT:    v_readfirstlane_b32 s48, v5
 ; SI-NEXT:    v_readfirstlane_b32 s49, v4
@@ -16544,6 +16543,7 @@ define inreg <40 x i8> @bitcast_v20i16_to_v40i8_scalar(<20 x i16> inreg %a, i32
 ; SI-NEXT:    v_readfirstlane_b32 s35, v2
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; SI-NEXT:    v_readfirstlane_b32 s38, v1
+; SI-NEXT:    v_writelane_b32 v8, s31, 10
 ; SI-NEXT:    s_cbranch_scc0 .LBB49_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_and_b32 s4, s16, 0xffff
@@ -16815,18 +16815,18 @@ define inreg <40 x i8> @bitcast_v20i16_to_v40i8_scalar(<20 x i16> inreg %a, i32
 ; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 36, v0
 ; SI-NEXT:    v_mov_b32_e32 v1, s4
+; SI-NEXT:    v_readlane_b32 s30, v8, 9
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT:    v_readlane_b32 s50, v8, 10
-; SI-NEXT:    v_readlane_b32 s49, v8, 9
-; SI-NEXT:    v_readlane_b32 s48, v8, 8
-; SI-NEXT:    v_readlane_b32 s39, v8, 7
-; SI-NEXT:    v_readlane_b32 s38, v8, 6
-; SI-NEXT:    v_readlane_b32 s37, v8, 5
-; SI-NEXT:    v_readlane_b32 s36, v8, 4
-; SI-NEXT:    v_readlane_b32 s35, v8, 3
-; SI-NEXT:    v_readlane_b32 s34, v8, 2
-; SI-NEXT:    v_readlane_b32 s31, v8, 1
-; SI-NEXT:    v_readlane_b32 s30, v8, 0
+; SI-NEXT:    v_readlane_b32 s31, v8, 10
+; SI-NEXT:    v_readlane_b32 s50, v8, 8
+; SI-NEXT:    v_readlane_b32 s49, v8, 7
+; SI-NEXT:    v_readlane_b32 s48, v8, 6
+; SI-NEXT:    v_readlane_b32 s39, v8, 5
+; SI-NEXT:    v_readlane_b32 s38, v8, 4
+; SI-NEXT:    v_readlane_b32 s37, v8, 3
+; SI-NEXT:    v_readlane_b32 s36, v8, 2
+; SI-NEXT:    v_readlane_b32 s35, v8, 1
+; SI-NEXT:    v_readlane_b32 s34, v8, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
index 0329f23ea434f..6bbfcc9f52d95 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
@@ -118,32 +118,32 @@ define double @test_pow_fast_f64__integral_y(double %x, i32 %y.i) {
 ; CHECK-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
 ; CHECK-NEXT:    s_mov_b64 exec, s[18:19]
 ; CHECK-NEXT:    v_writelane_b32 v43, s16, 14
-; CHECK-NEXT:    v_writelane_b32 v43, s30, 0
-; CHECK-NEXT:    v_writelane_b32 v43, s31, 1
-; CHECK-NEXT:    v_writelane_b32 v43, s34, 2
-; CHECK-NEXT:    v_writelane_b32 v43, s35, 3
-; CHECK-NEXT:    v_writelane_b32 v43, s36, 4
-; CHECK-NEXT:    v_writelane_b32 v43, s37, 5
-; CHECK-NEXT:    v_writelane_b32 v43, s38, 6
-; CHECK-NEXT:    v_writelane_b32 v43, s39, 7
+; CHECK-NEXT:    v_writelane_b32 v43, s34, 0
+; CHECK-NEXT:    v_writelane_b32 v43, s35, 1
+; CHECK-NEXT:    v_writelane_b32 v43, s36, 2
+; CHECK-NEXT:    v_writelane_b32 v43, s37, 3
+; CHECK-NEXT:    v_writelane_b32 v43, s38, 4
+; CHECK-NEXT:    v_writelane_b32 v43, s39, 5
 ; CHECK-NEXT:    s_addk_i32 s32, 0x800
-; CHECK-NEXT:    v_writelane_b32 v43, s48, 8
-; CHECK-NEXT:    v_writelane_b32 v43, s49, 9
+; CHECK-NEXT:    v_writelane_b32 v43, s48, 6
+; CHECK-NEXT:    v_writelane_b32 v43, s49, 7
 ; CHECK-NEXT:    s_mov_b64 s[48:49], s[4:5]
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
 ; CHECK-NEXT:    s_add_u32 s4, s4, _Z4log2d at gotpcrel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4log2d at gotpcrel32@hi+12
+; CHECK-NEXT:    v_writelane_b32 v43, s50, 8
 ; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT:    v_writelane_b32 v43, s50, 10
+; CHECK-NEXT:    v_writelane_b32 v43, s51, 9
+; CHECK-NEXT:    v_writelane_b32 v43, s52, 10
 ; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; CHECK-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
-; CHECK-NEXT:    v_writelane_b32 v43, s51, 11
+; CHECK-NEXT:    v_writelane_b32 v43, s53, 11
 ; CHECK-NEXT:    v_mov_b32_e32 v42, v1
-; CHECK-NEXT:    v_writelane_b32 v43, s52, 12
+; CHECK-NEXT:    v_writelane_b32 v43, s30, 12
 ; CHECK-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v42
 ; CHECK-NEXT:    s_mov_b64 s[4:5], s[48:49]
-; CHECK-NEXT:    v_writelane_b32 v43, s53, 13
+; CHECK-NEXT:    v_writelane_b32 v43, s31, 13
 ; CHECK-NEXT:    v_mov_b32_e32 v40, v31
 ; CHECK-NEXT:    v_mov_b32_e32 v41, v2
 ; CHECK-NEXT:    s_mov_b32 s50, s15
@@ -177,21 +177,21 @@ define double @test_pow_fast_f64__integral_y(double %x, i32 %y.i) {
 ; CHECK-NEXT:    buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
 ; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; CHECK-NEXT:    v_readlane_b32 s30, v43, 12
 ; CHECK-NEXT:    v_or_b32_e32 v1, v2, v1
-; CHECK-NEXT:    v_readlane_b32 s53, v43, 13
-; CHECK-NEXT:    v_readlane_b32 s52, v43, 12
-; CHECK-NEXT:    v_readlane_b32 s51, v43, 11
-; CHECK-NEXT:    v_readlane_b32 s50, v43, 10
-; CHECK-NEXT:    v_readlane_b32 s49, v43, 9
-; CHECK-NEXT:    v_readlane_b32 s48, v43, 8
-; CHECK-NEXT:    v_readlane_b32 s39, v43, 7
-; CHECK-NEXT:    v_readlane_b32 s38, v43, 6
-; CHECK-NEXT:    v_readlane_b32 s37, v43, 5
-; CHECK-NEXT:    v_readlane_b32 s36, v43, 4
-; CHECK-NEXT:    v_readlane_b32 s35, v43, 3
-; CHECK-NEXT:    v_readlane_b32 s34, v43, 2
-; CHECK-NEXT:    v_readlane_b32 s31, v43, 1
-; CHECK-NEXT:    v_readlane_b32 s30, v43, 0
+; CHECK-NEXT:    v_readlane_b32 s31, v43, 13
+; CHECK-NEXT:    v_readlane_b32 s53, v43, 11
+; CHECK-NEXT:    v_readlane_b32 s52, v43, 10
+; CHECK-NEXT:    v_readlane_b32 s51, v43, 9
+; CHECK-NEXT:    v_readlane_b32 s50, v43, 8
+; CHECK-NEXT:    v_readlane_b32 s49, v43, 7
+; CHECK-NEXT:    v_readlane_b32 s48, v43, 6
+; CHECK-NEXT:    v_readlane_b32 s39, v43, 5
+; CHECK-NEXT:    v_readlane_b32 s38, v43, 4
+; CHECK-NEXT:    v_readlane_b32 s37, v43, 3
+; CHECK-NEXT:    v_readlane_b32 s36, v43, 2
+; CHECK-NEXT:    v_readlane_b32 s35, v43, 1
+; CHECK-NEXT:    v_readlane_b32 s34, v43, 0
 ; CHECK-NEXT:    s_mov_b32 s32, s33
 ; CHECK-NEXT:    v_readlane_b32 s4, v43, 14
 ; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -258,30 +258,30 @@ define double @test_powr_fast_f64(double %x, double %y) {
 ; CHECK-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
 ; CHECK-NEXT:    s_mov_b64 exec, s[18:19]
 ; CHECK-NEXT:    v_writelane_b32 v43, s16, 14
-; CHECK-NEXT:    v_writelane_b32 v43, s30, 0
-; CHECK-NEXT:    v_writelane_b32 v43, s31, 1
-; CHECK-NEXT:    v_writelane_b32 v43, s34, 2
-; CHECK-NEXT:    v_writelane_b32 v43, s35, 3
-; CHECK-NEXT:    v_writelane_b32 v43, s36, 4
-; CHECK-NEXT:    v_writelane_b32 v43, s37, 5
-; CHECK-NEXT:    v_writelane_b32 v43, s38, 6
-; CHECK-NEXT:    v_writelane_b32 v43, s39, 7
+; CHECK-NEXT:    v_writelane_b32 v43, s34, 0
+; CHECK-NEXT:    v_writelane_b32 v43, s35, 1
+; CHECK-NEXT:    v_writelane_b32 v43, s36, 2
+; CHECK-NEXT:    v_writelane_b32 v43, s37, 3
+; CHECK-NEXT:    v_writelane_b32 v43, s38, 4
+; CHECK-NEXT:    v_writelane_b32 v43, s39, 5
 ; CHECK-NEXT:    s_addk_i32 s32, 0x800
-; CHECK-NEXT:    v_writelane_b32 v43, s48, 8
-; CHECK-NEXT:    v_writelane_b32 v43, s49, 9
+; CHECK-NEXT:    v_writelane_b32 v43, s48, 6
+; CHECK-NEXT:    v_writelane_b32 v43, s49, 7
 ; CHECK-NEXT:    s_mov_b64 s[48:49], s[4:5]
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
 ; CHECK-NEXT:    s_add_u32 s4, s4, _Z4log2d at gotpcrel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4log2d at gotpcrel32@hi+12
+; CHECK-NEXT:    v_writelane_b32 v43, s50, 8
 ; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT:    v_writelane_b32 v43, s50, 10
-; CHECK-NEXT:    v_writelane_b32 v43, s51, 11
-; CHECK-NEXT:    v_writelane_b32 v43, s52, 12
+; CHECK-NEXT:    v_writelane_b32 v43, s51, 9
+; CHECK-NEXT:    v_writelane_b32 v43, s52, 10
+; CHECK-NEXT:    v_writelane_b32 v43, s53, 11
+; CHECK-NEXT:    v_writelane_b32 v43, s30, 12
 ; CHECK-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; CHECK-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
-; CHECK-NEXT:    v_writelane_b32 v43, s53, 13
+; CHECK-NEXT:    v_writelane_b32 v43, s31, 13
 ; CHECK-NEXT:    v_mov_b32_e32 v42, v31
 ; CHECK-NEXT:    v_mov_b32_e32 v41, v3
 ; CHECK-NEXT:    v_mov_b32_e32 v40, v2
@@ -313,20 +313,20 @@ define double @test_powr_fast_f64(double %x, double %y) {
 ; CHECK-NEXT:    buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
 ; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; CHECK-NEXT:    v_readlane_b32 s53, v43, 13
-; CHECK-NEXT:    v_readlane_b32 s52, v43, 12
-; CHECK-NEXT:    v_readlane_b32 s51, v43, 11
-; CHECK-NEXT:    v_readlane_b32 s50, v43, 10
-; CHECK-NEXT:    v_readlane_b32 s49, v43, 9
-; CHECK-NEXT:    v_readlane_b32 s48, v43, 8
-; CHECK-NEXT:    v_readlane_b32 s39, v43, 7
-; CHECK-NEXT:    v_readlane_b32 s38, v43, 6
-; CHECK-NEXT:    v_readlane_b32 s37, v43, 5
-; CHECK-NEXT:    v_readlane_b32 s36, v43, 4
-; CHECK-NEXT:    v_readlane_b32 s35, v43, 3
-; CHECK-NEXT:    v_readlane_b32 s34, v43, 2
-; CHECK-NEXT:    v_readlane_b32 s31, v43, 1
-; CHECK-NEXT:    v_readlane_b32 s30, v43, 0
+; CHECK-NEXT:    v_readlane_b32 s30, v43, 12
+; CHECK-NEXT:    v_readlane_b32 s31, v43, 13
+; CHECK-NEXT:    v_readlane_b32 s53, v43, 11
+; CHECK-NEXT:    v_readlane_b32 s52, v43, 10
+; CHECK-NEXT:    v_readlane_b32 s51, v43, 9
+; CHECK-NEXT:    v_readlane_b32 s50, v43, 8
+; CHECK-NEXT:    v_readlane_b32 s49, v43, 7
+; CHECK-NEXT:    v_readlane_b32 s48, v43, 6
+; CHECK-NEXT:    v_readlane_b32 s39, v43, 5
+; CHECK-NEXT:    v_readlane_b32 s38, v43, 4
+; CHECK-NEXT:    v_readlane_b32 s37, v43, 3
+; CHECK-NEXT:    v_readlane_b32 s36, v43, 2
+; CHECK-NEXT:    v_readlane_b32 s35, v43, 1
+; CHECK-NEXT:    v_readlane_b32 s34, v43, 0
 ; CHECK-NEXT:    s_mov_b32 s32, s33
 ; CHECK-NEXT:    v_readlane_b32 s4, v43, 14
 ; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -400,32 +400,32 @@ define double @test_pown_fast_f64(double %x, i32 %y) {
 ; CHECK-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
 ; CHECK-NEXT:    s_mov_b64 exec, s[18:19]
 ; CHECK-NEXT:    v_writelane_b32 v43, s16, 14
-; CHECK-NEXT:    v_writelane_b32 v43, s30, 0
-; CHECK-NEXT:    v_writelane_b32 v43, s31, 1
-; CHECK-NEXT:    v_writelane_b32 v43, s34, 2
-; CHECK-NEXT:    v_writelane_b32 v43, s35, 3
-; CHECK-NEXT:    v_writelane_b32 v43, s36, 4
-; CHECK-NEXT:    v_writelane_b32 v43, s37, 5
-; CHECK-NEXT:    v_writelane_b32 v43, s38, 6
-; CHECK-NEXT:    v_writelane_b32 v43, s39, 7
+; CHECK-NEXT:    v_writelane_b32 v43, s34, 0
+; CHECK-NEXT:    v_writelane_b32 v43, s35, 1
+; CHECK-NEXT:    v_writelane_b32 v43, s36, 2
+; CHECK-NEXT:    v_writelane_b32 v43, s37, 3
+; CHECK-NEXT:    v_writelane_b32 v43, s38, 4
+; CHECK-NEXT:    v_writelane_b32 v43, s39, 5
 ; CHECK-NEXT:    s_addk_i32 s32, 0x800
-; CHECK-NEXT:    v_writelane_b32 v43, s48, 8
-; CHECK-NEXT:    v_writelane_b32 v43, s49, 9
+; CHECK-NEXT:    v_writelane_b32 v43, s48, 6
+; CHECK-NEXT:    v_writelane_b32 v43, s49, 7
 ; CHECK-NEXT:    s_mov_b64 s[48:49], s[4:5]
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
 ; CHECK-NEXT:    s_add_u32 s4, s4, _Z4log2d at gotpcrel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4log2d at gotpcrel32@hi+12
+; CHECK-NEXT:    v_writelane_b32 v43, s50, 8
 ; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT:    v_writelane_b32 v43, s50, 10
+; CHECK-NEXT:    v_writelane_b32 v43, s51, 9
+; CHECK-NEXT:    v_writelane_b32 v43, s52, 10
 ; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; CHECK-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
-; CHECK-NEXT:    v_writelane_b32 v43, s51, 11
+; CHECK-NEXT:    v_writelane_b32 v43, s53, 11
 ; CHECK-NEXT:    v_mov_b32_e32 v42, v1
-; CHECK-NEXT:    v_writelane_b32 v43, s52, 12
+; CHECK-NEXT:    v_writelane_b32 v43, s30, 12
 ; CHECK-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v42
 ; CHECK-NEXT:    s_mov_b64 s[4:5], s[48:49]
-; CHECK-NEXT:    v_writelane_b32 v43, s53, 13
+; CHECK-NEXT:    v_writelane_b32 v43, s31, 13
 ; CHECK-NEXT:    v_mov_b32_e32 v40, v31
 ; CHECK-NEXT:    v_mov_b32_e32 v41, v2
 ; CHECK-NEXT:    s_mov_b32 s50, s15
@@ -459,21 +459,21 @@ define double @test_pown_fast_f64(double %x, i32 %y) {
 ; CHECK-NEXT:    buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
 ; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; CHECK-NEXT:    v_readlane_b32 s30, v43, 12
 ; CHECK-NEXT:    v_or_b32_e32 v1, v2, v1
-; CHECK-NEXT:    v_readlane_b32 s53, v43, 13
-; CHECK-NEXT:    v_readlane_b32 s52, v43, 12
-; CHECK-NEXT:    v_readlane_b32 s51, v43, 11
-; CHECK-NEXT:    v_readlane_b32 s50, v43, 10
-; CHECK-NEXT:    v_readlane_b32 s49, v43, 9
-; CHECK-NEXT:    v_readlane_b32 s48, v43, 8
-; CHECK-NEXT:    v_readlane_b32 s39, v43, 7
-; CHECK-NEXT:    v_readlane_b32 s38, v43, 6
-; CHECK-NEXT:    v_readlane_b32 s37, v43, 5
-; CHECK-NEXT:    v_readlane_b32 s36, v43, 4
-; CHECK-NEXT:    v_readlane_b32 s35, v43, 3
-; CHECK-NEXT:    v_readlane_b32 s34, v43, 2
-; CHECK-NEXT:    v_readlane_b32 s31, v43, 1
-; CHECK-NEXT:    v_readlane_b32 s30, v43, 0
+; CHECK-NEXT:    v_readlane_b32 s31, v43, 13
+; CHECK-NEXT:    v_readlane_b32 s53, v43, 11
+; CHECK-NEXT:    v_readlane_b32 s52, v43, 10
+; CHECK-NEXT:    v_readlane_b32 s51, v43, 9
+; CHECK-NEXT:    v_readlane_b32 s50, v43, 8
+; CHECK-NEXT:    v_readlane_b32 s49, v43, 7
+; CHECK-NEXT:    v_readlane_b32 s48, v43, 6
+; CHECK-NEXT:    v_readlane_b32 s39, v43, 5
+; CHECK-NEXT:    v_readlane_b32 s38, v43, 4
+; CHECK-NEXT:    v_readlane_b32 s37, v43, 3
+; CHECK-NEXT:    v_readlane_b32 s36, v43, 2
+; CHECK-NEXT:    v_readlane_b32 s35, v43, 1
+; CHECK-NEXT:    v_readlane_b32 s34, v43, 0
 ; CHECK-NEXT:    s_mov_b32 s32, s33
 ; CHECK-NEXT:    v_readlane_b32 s4, v43, 14
 ; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -542,30 +542,30 @@ define double @test_pown_fast_f64_known_even(double %x, i32 %y.arg) {
 ; CHECK-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; CHECK-NEXT:    s_mov_b64 exec, s[18:19]
 ; CHECK-NEXT:    v_writelane_b32 v42, s16, 14
-; CHECK-NEXT:    v_writelane_b32 v42, s30, 0
-; CHECK-NEXT:    v_writelane_b32 v42, s31, 1
-; CHECK-NEXT:    v_writelane_b32 v42, s34, 2
-; CHECK-NEXT:    v_writelane_b32 v42, s35, 3
-; CHECK-NEXT:    v_writelane_b32 v42, s36, 4
-; CHECK-NEXT:    v_writelane_b32 v42, s37, 5
-; CHECK-NEXT:    v_writelane_b32 v42, s38, 6
-; CHECK-NEXT:    v_writelane_b32 v42, s39, 7
+; CHECK-NEXT:    v_writelane_b32 v42, s34, 0
+; CHECK-NEXT:    v_writelane_b32 v42, s35, 1
+; CHECK-NEXT:    v_writelane_b32 v42, s36, 2
+; CHECK-NEXT:    v_writelane_b32 v42, s37, 3
+; CHECK-NEXT:    v_writelane_b32 v42, s38, 4
+; CHECK-NEXT:    v_writelane_b32 v42, s39, 5
 ; CHECK-NEXT:    s_addk_i32 s32, 0x400
-; CHECK-NEXT:    v_writelane_b32 v42, s48, 8
-; CHECK-NEXT:    v_writelane_b32 v42, s49, 9
+; CHECK-NEXT:    v_writelane_b32 v42, s48, 6
+; CHECK-NEXT:    v_writelane_b32 v42, s49, 7
 ; CHECK-NEXT:    s_mov_b64 s[48:49], s[4:5]
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
 ; CHECK-NEXT:    s_add_u32 s4, s4, _Z4log2d at gotpcrel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4log2d at gotpcrel32@hi+12
+; CHECK-NEXT:    v_writelane_b32 v42, s50, 8
 ; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT:    v_writelane_b32 v42, s50, 10
-; CHECK-NEXT:    v_writelane_b32 v42, s51, 11
-; CHECK-NEXT:    v_writelane_b32 v42, s52, 12
+; CHECK-NEXT:    v_writelane_b32 v42, s51, 9
+; CHECK-NEXT:    v_writelane_b32 v42, s52, 10
+; CHECK-NEXT:    v_writelane_b32 v42, s53, 11
+; CHECK-NEXT:    v_writelane_b32 v42, s30, 12
 ; CHECK-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v1
 ; CHECK-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
-; CHECK-NEXT:    v_writelane_b32 v42, s53, 13
+; CHECK-NEXT:    v_writelane_b32 v42, s31, 13
 ; CHECK-NEXT:    v_mov_b32_e32 v40, v31
 ; CHECK-NEXT:    s_mov_b32 s50, s15
 ; CHECK-NEXT:    s_mov_b32 s51, s14
@@ -596,20 +596,20 @@ define double @test_pown_fast_f64_known_even(double %x, i32 %y.arg) {
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
 ; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; CHECK-NEXT:    v_readlane_b32 s53, v42, 13
-; CHECK-NEXT:    v_readlane_b32 s52, v42, 12
-; CHECK-NEXT:    v_readlane_b32 s51, v42, 11
-; CHECK-NEXT:    v_readlane_b32 s50, v42, 10
-; CHECK-NEXT:    v_readlane_b32 s49, v42, 9
-; CHECK-NEXT:    v_readlane_b32 s48, v42, 8
-; CHECK-NEXT:    v_readlane_b32 s39, v42, 7
-; CHECK-NEXT:    v_readlane_b32 s38, v42, 6
-; CHECK-NEXT:    v_readlane_b32 s37, v42, 5
-; CHECK-NEXT:    v_readlane_b32 s36, v42, 4
-; CHECK-NEXT:    v_readlane_b32 s35, v42, 3
-; CHECK-NEXT:    v_readlane_b32 s34, v42, 2
-; CHECK-NEXT:    v_readlane_b32 s31, v42, 1
-; CHECK-NEXT:    v_readlane_b32 s30, v42, 0
+; CHECK-NEXT:    v_readlane_b32 s30, v42, 12
+; CHECK-NEXT:    v_readlane_b32 s31, v42, 13
+; CHECK-NEXT:    v_readlane_b32 s53, v42, 11
+; CHECK-NEXT:    v_readlane_b32 s52, v42, 10
+; CHECK-NEXT:    v_readlane_b32 s51, v42, 9
+; CHECK-NEXT:    v_readlane_b32 s50, v42, 8
+; CHECK-NEXT:    v_readlane_b32 s49, v42, 7
+; CHECK-NEXT:    v_readlane_b32 s48, v42, 6
+; CHECK-NEXT:    v_readlane_b32 s39, v42, 5
+; CHECK-NEXT:    v_readlane_b32 s38, v42, 4
+; CHECK-NEXT:    v_readlane_b32 s37, v42, 3
+; CHECK-NEXT:    v_readlane_b32 s36, v42, 2
+; CHECK-NEXT:    v_readlane_b32 s35, v42, 1
+; CHECK-NEXT:    v_readlane_b32 s34, v42, 0
 ; CHECK-NEXT:    s_mov_b32 s32, s33
 ; CHECK-NEXT:    v_readlane_b32 s4, v42, 14
 ; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -683,32 +683,32 @@ define double @test_pown_fast_f64_known_odd(double %x, i32 %y.arg) {
 ; CHECK-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
 ; CHECK-NEXT:    s_mov_b64 exec, s[18:19]
 ; CHECK-NEXT:    v_writelane_b32 v43, s16, 14
-; CHECK-NEXT:    v_writelane_b32 v43, s30, 0
-; CHECK-NEXT:    v_writelane_b32 v43, s31, 1
-; CHECK-NEXT:    v_writelane_b32 v43, s34, 2
-; CHECK-NEXT:    v_writelane_b32 v43, s35, 3
-; CHECK-NEXT:    v_writelane_b32 v43, s36, 4
-; CHECK-NEXT:    v_writelane_b32 v43, s37, 5
-; CHECK-NEXT:    v_writelane_b32 v43, s38, 6
-; CHECK-NEXT:    v_writelane_b32 v43, s39, 7
+; CHECK-NEXT:    v_writelane_b32 v43, s34, 0
+; CHECK-NEXT:    v_writelane_b32 v43, s35, 1
+; CHECK-NEXT:    v_writelane_b32 v43, s36, 2
+; CHECK-NEXT:    v_writelane_b32 v43, s37, 3
+; CHECK-NEXT:    v_writelane_b32 v43, s38, 4
+; CHECK-NEXT:    v_writelane_b32 v43, s39, 5
 ; CHECK-NEXT:    s_addk_i32 s32, 0x800
-; CHECK-NEXT:    v_writelane_b32 v43, s48, 8
-; CHECK-NEXT:    v_writelane_b32 v43, s49, 9
+; CHECK-NEXT:    v_writelane_b32 v43, s48, 6
+; CHECK-NEXT:    v_writelane_b32 v43, s49, 7
 ; CHECK-NEXT:    s_mov_b64 s[48:49], s[4:5]
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
 ; CHECK-NEXT:    s_add_u32 s4, s4, _Z4log2d at gotpcrel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4log2d at gotpcrel32@hi+12
+; CHECK-NEXT:    v_writelane_b32 v43, s50, 8
 ; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT:    v_writelane_b32 v43, s50, 10
+; CHECK-NEXT:    v_writelane_b32 v43, s51, 9
+; CHECK-NEXT:    v_writelane_b32 v43, s52, 10
 ; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; CHECK-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
-; CHECK-NEXT:    v_writelane_b32 v43, s51, 11
+; CHECK-NEXT:    v_writelane_b32 v43, s53, 11
 ; CHECK-NEXT:    v_mov_b32_e32 v41, v1
-; CHECK-NEXT:    v_writelane_b32 v43, s52, 12
+; CHECK-NEXT:    v_writelane_b32 v43, s30, 12
 ; CHECK-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v41
 ; CHECK-NEXT:    s_mov_b64 s[4:5], s[48:49]
-; CHECK-NEXT:    v_writelane_b32 v43, s53, 13
+; CHECK-NEXT:    v_writelane_b32 v43, s31, 13
 ; CHECK-NEXT:    v_mov_b32_e32 v40, v31
 ; CHECK-NEXT:    s_mov_b32 s50, s15
 ; CHECK-NEXT:    s_mov_b32 s51, s14
@@ -741,21 +741,21 @@ define double @test_pown_fast_f64_known_odd(double %x, i32 %y.arg) {
 ; CHECK-NEXT:    buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
 ; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; CHECK-NEXT:    v_readlane_b32 s30, v43, 12
 ; CHECK-NEXT:    v_or_b32_e32 v1, v2, v1
-; CHECK-NEXT:    v_readlane_b32 s53, v43, 13
-; CHECK-NEXT:    v_readlane_b32 s52, v43, 12
-; CHECK-NEXT:    v_readlane_b32 s51, v43, 11
-; CHECK-NEXT:    v_readlane_b32 s50, v43, 10
-; CHECK-NEXT:    v_readlane_b32 s49, v43, 9
-; CHECK-NEXT:    v_readlane_b32 s48, v43, 8
-; CHECK-NEXT:    v_readlane_b32 s39, v43, 7
-; CHECK-NEXT:    v_readlane_b32 s38, v43, 6
-; CHECK-NEXT:    v_readlane_b32 s37, v43, 5
-; CHECK-NEXT:    v_readlane_b32 s36, v43, 4
-; CHECK-NEXT:    v_readlane_b32 s35, v43, 3
-; CHECK-NEXT:    v_readlane_b32 s34, v43, 2
-; CHECK-NEXT:    v_readlane_b32 s31, v43, 1
-; CHECK-NEXT:    v_readlane_b32 s30, v43, 0
+; CHECK-NEXT:    v_readlane_b32 s31, v43, 13
+; CHECK-NEXT:    v_readlane_b32 s53, v43, 11
+; CHECK-NEXT:    v_readlane_b32 s52, v43, 10
+; CHECK-NEXT:    v_readlane_b32 s51, v43, 9
+; CHECK-NEXT:    v_readlane_b32 s50, v43, 8
+; CHECK-NEXT:    v_readlane_b32 s49, v43, 7
+; CHECK-NEXT:    v_readlane_b32 s48, v43, 6
+; CHECK-NEXT:    v_readlane_b32 s39, v43, 5
+; CHECK-NEXT:    v_readlane_b32 s38, v43, 4
+; CHECK-NEXT:    v_readlane_b32 s37, v43, 3
+; CHECK-NEXT:    v_readlane_b32 s36, v43, 2
+; CHECK-NEXT:    v_readlane_b32 s35, v43, 1
+; CHECK-NEXT:    v_readlane_b32 s34, v43, 0
 ; CHECK-NEXT:    s_mov_b32 s32, s33
 ; CHECK-NEXT:    v_readlane_b32 s4, v43, 14
 ; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior2.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior2.ll
index 583b6fe0a81ca..3bed751c979c3 100644
--- a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior2.ll
+++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior2.ll
@@ -214,8 +214,8 @@ define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) #0 {
 ; GFX8-NEXT:    v_writelane_b32 v3, s31, 1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX8-NEXT:    v_readlane_b32 s31, v3, 1
 ; GFX8-NEXT:    v_readlane_b32 s30, v3, 0
+; GFX8-NEXT:    v_readlane_b32 s31, v3, 1
 ; GFX8-NEXT:    s_mov_b32 s32, s33
 ; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX8-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -242,8 +242,8 @@ define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) #0 {
 ; GFX8-ARCH-FLAT-NEXT:    v_writelane_b32 v3, s31, 1
 ; GFX8-ARCH-FLAT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-ARCH-FLAT-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX8-ARCH-FLAT-NEXT:    v_readlane_b32 s31, v3, 1
 ; GFX8-ARCH-FLAT-NEXT:    v_readlane_b32 s30, v3, 0
+; GFX8-ARCH-FLAT-NEXT:    v_readlane_b32 s31, v3, 1
 ; GFX8-ARCH-FLAT-NEXT:    s_mov_b32 s32, s33
 ; GFX8-ARCH-FLAT-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX8-ARCH-FLAT-NEXT:    s_add_i32 s3, s33, 8
@@ -270,8 +270,8 @@ define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) #0 {
 ; GFX9-NEXT:    v_writelane_b32 v3, s31, 1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT:    v_readlane_b32 s31, v3, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v3, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v3, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -297,8 +297,8 @@ define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) #0 {
 ; GFX9-ARCH-FLAT-NEXT:    v_writelane_b32 v3, s31, 1
 ; GFX9-ARCH-FLAT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-ARCH-FLAT-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX9-ARCH-FLAT-NEXT:    v_readlane_b32 s31, v3, 1
 ; GFX9-ARCH-FLAT-NEXT:    v_readlane_b32 s30, v3, 0
+; GFX9-ARCH-FLAT-NEXT:    v_readlane_b32 s31, v3, 1
 ; GFX9-ARCH-FLAT-NEXT:    s_mov_b32 s32, s33
 ; GFX9-ARCH-FLAT-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX9-ARCH-FLAT-NEXT:    scratch_load_dword v3, off, s33 ; 4-byte Folded Reload
@@ -321,11 +321,12 @@ define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) #0 {
 ; GFX942-ARCH-FLAT-NEXT:    s_addc_u32 s1, s1, with_private_to_flat_addrspacecast at gotpcrel32@hi+12
 ; GFX942-ARCH-FLAT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; GFX942-ARCH-FLAT-NEXT:    v_writelane_b32 v3, s30, 0
+; GFX942-ARCH-FLAT-NEXT:    s_nop 1
 ; GFX942-ARCH-FLAT-NEXT:    v_writelane_b32 v3, s31, 1
 ; GFX942-ARCH-FLAT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-ARCH-FLAT-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX942-ARCH-FLAT-NEXT:    v_readlane_b32 s31, v3, 1
 ; GFX942-ARCH-FLAT-NEXT:    v_readlane_b32 s30, v3, 0
+; GFX942-ARCH-FLAT-NEXT:    v_readlane_b32 s31, v3, 1
 ; GFX942-ARCH-FLAT-NEXT:    s_mov_b32 s32, s33
 ; GFX942-ARCH-FLAT-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-ARCH-FLAT-NEXT:    scratch_load_dword v3, off, s33 ; 4-byte Folded Reload
@@ -352,8 +353,8 @@ define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) #0 {
 ; GFX10-NEXT:    v_writelane_b32 v3, s31, 1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX10-NEXT:    v_readlane_b32 s31, v3, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v3, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v3, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
 ; GFX10-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index ab9cd8e037734..7d1e47ab8acb0 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -4395,8 +4395,8 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
 ; GCN-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GCN-NEXT:    buffer_store_short v0, v1, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_readlane_b32 s31, v2, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v2, 0
+; GCN-NEXT:    v_readlane_b32 s31, v2, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -4424,10 +4424,10 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
 ; GFX7-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:    v_readlane_b32 s30, v2, 0
 ; GFX7-NEXT:    buffer_store_short v0, v1, s[0:3], 0 offen
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_readlane_b32 s31, v2, 1
-; GFX7-NEXT:    v_readlane_b32 s30, v2, 0
 ; GFX7-NEXT:    s_mov_b32 s32, s33
 ; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX7-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -4453,10 +4453,10 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
 ; GFX8-NEXT:    v_writelane_b32 v2, s31, 1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX8-NEXT:    v_readlane_b32 s30, v2, 0
 ; GFX8-NEXT:    buffer_store_short v0, v1, s[0:3], 0 offen
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_readlane_b32 s31, v2, 1
-; GFX8-NEXT:    v_readlane_b32 s30, v2, 0
 ; GFX8-NEXT:    s_mov_b32 s32, s33
 ; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX8-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -4482,10 +4482,10 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
 ; GFX900-NEXT:    v_writelane_b32 v2, s31, 1
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX900-NEXT:    v_readlane_b32 s30, v2, 0
 ; GFX900-NEXT:    buffer_store_short v0, v1, s[0:3], 0 offen
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    v_readlane_b32 s31, v2, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v2, 0
 ; GFX900-NEXT:    s_mov_b32 s32, s33
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -4508,13 +4508,14 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
 ; GFX950-NEXT:    s_addc_u32 s1, s1, test_arg_store at gotpcrel32@hi+12
 ; GFX950-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; GFX950-NEXT:    v_writelane_b32 v4, s30, 0
+; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_writelane_b32 v4, s31, 1
 ; GFX950-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX950-NEXT:    v_readlane_b32 s30, v4, 0
 ; GFX950-NEXT:    scratch_store_short v1, v0, off sc0 sc1
 ; GFX950-NEXT:    s_waitcnt vmcnt(0)
 ; GFX950-NEXT:    v_readlane_b32 s31, v4, 1
-; GFX950-NEXT:    v_readlane_b32 s30, v4, 0
 ; GFX950-NEXT:    s_mov_b32 s32, s33
 ; GFX950-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX950-NEXT:    scratch_load_dword v4, off, s33 ; 4-byte Folded Reload
@@ -4541,10 +4542,10 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
 ; GFX10-NEXT:    v_writelane_b32 v2, s31, 1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX10-NEXT:    v_readlane_b32 s30, v2, 0
 ; GFX10-NEXT:    buffer_store_short v0, v1, s[0:3], 0 offen
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_readlane_b32 s31, v2, 1
-; GFX10-NEXT:    v_readlane_b32 s30, v2, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
 ; GFX10-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -4571,10 +4572,11 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
 ; GFX11-NEXT:    v_writelane_b32 v2, s31, 1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_readlane_b32 s30, v2, 0
 ; GFX11-NEXT:    scratch_store_b16 v1, v0, off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    v_readlane_b32 s31, v2, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v2, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload
@@ -4601,10 +4603,11 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
 ; GFX1250-NEXT:    v_writelane_b32 v4, s31, 1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_swap_pc_i64 s[30:31], s[0:1]
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_readlane_b32 s30, v4, 0
 ; GFX1250-NEXT:    scratch_store_b16 v1, v0, off scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    v_readlane_b32 s31, v4, 1
-; GFX1250-NEXT:    v_readlane_b32 s30, v4, 0
 ; GFX1250-NEXT:    s_mov_b32 s32, s33
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-NEXT:    s_xor_saveexec_b32 s0, -1
@@ -4648,8 +4651,8 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    buffer_store_short v0, v2, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_readlane_b32 s31, v4, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v4, 0
+; GCN-NEXT:    v_readlane_b32 s31, v4, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -4679,13 +4682,13 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 2, v2
+; GFX7-NEXT:    v_readlane_b32 s30, v4, 0
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    buffer_store_short v1, v3, s[0:3], 0 offen
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_store_short v0, v2, s[0:3], 0 offen
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_readlane_b32 s31, v4, 1
-; GFX7-NEXT:    v_readlane_b32 s30, v4, 0
 ; GFX7-NEXT:    s_mov_b32 s32, s33
 ; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX7-NEXT:    buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -4711,10 +4714,10 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX8-NEXT:    v_writelane_b32 v2, s31, 1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX8-NEXT:    v_readlane_b32 s30, v2, 0
 ; GFX8-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_readlane_b32 s31, v2, 1
-; GFX8-NEXT:    v_readlane_b32 s30, v2, 0
 ; GFX8-NEXT:    s_mov_b32 s32, s33
 ; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX8-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -4740,10 +4743,10 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX900-NEXT:    v_writelane_b32 v2, s31, 1
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX900-NEXT:    v_readlane_b32 s30, v2, 0
 ; GFX900-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    v_readlane_b32 s31, v2, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v2, 0
 ; GFX900-NEXT:    s_mov_b32 s32, s33
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -4766,13 +4769,14 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX950-NEXT:    s_addc_u32 s1, s1, test_arg_store_v2bf16 at gotpcrel32@hi+12
 ; GFX950-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; GFX950-NEXT:    v_writelane_b32 v4, s30, 0
+; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_writelane_b32 v4, s31, 1
 ; GFX950-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX950-NEXT:    v_readlane_b32 s30, v4, 0
 ; GFX950-NEXT:    scratch_store_dword v1, v0, off sc0 sc1
 ; GFX950-NEXT:    s_waitcnt vmcnt(0)
 ; GFX950-NEXT:    v_readlane_b32 s31, v4, 1
-; GFX950-NEXT:    v_readlane_b32 s30, v4, 0
 ; GFX950-NEXT:    s_mov_b32 s32, s33
 ; GFX950-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX950-NEXT:    scratch_load_dword v4, off, s33 ; 4-byte Folded Reload
@@ -4799,10 +4803,10 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX10-NEXT:    v_writelane_b32 v2, s31, 1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX10-NEXT:    v_readlane_b32 s30, v2, 0
 ; GFX10-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_readlane_b32 s31, v2, 1
-; GFX10-NEXT:    v_readlane_b32 s30, v2, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
 ; GFX10-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -4829,10 +4833,11 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX11-NEXT:    v_writelane_b32 v2, s31, 1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_readlane_b32 s30, v2, 0
 ; GFX11-NEXT:    scratch_store_b32 v1, v0, off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    v_readlane_b32 s31, v2, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v2, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload
@@ -4859,10 +4864,11 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX1250-NEXT:    v_writelane_b32 v4, s31, 1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_swap_pc_i64 s[30:31], s[0:1]
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_readlane_b32 s30, v4, 0
 ; GFX1250-NEXT:    scratch_store_b32 v1, v0, off scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    v_readlane_b32 s31, v4, 1
-; GFX1250-NEXT:    v_readlane_b32 s30, v4, 0
 ; GFX1250-NEXT:    s_mov_b32 s32, s33
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-NEXT:    s_xor_saveexec_b32 s0, -1
@@ -4908,8 +4914,8 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    buffer_store_dword v0, v3, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_readlane_b32 s31, v5, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v5, 0
+; GCN-NEXT:    v_readlane_b32 s31, v5, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -4942,12 +4948,12 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v2
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 4, v3
+; GFX7-NEXT:    v_readlane_b32 s30, v4, 0
 ; GFX7-NEXT:    buffer_store_short v1, v2, s[0:3], 0 offen
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_store_dword v0, v3, s[0:3], 0 offen
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_readlane_b32 s31, v4, 1
-; GFX7-NEXT:    v_readlane_b32 s30, v4, 0
 ; GFX7-NEXT:    s_mov_b32 s32, s33
 ; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX7-NEXT:    buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -4974,12 +4980,12 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 4, v2
+; GFX8-NEXT:    v_readlane_b32 s30, v4, 0
 ; GFX8-NEXT:    buffer_store_short v1, v3, s[0:3], 0 offen
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_readlane_b32 s31, v4, 1
-; GFX8-NEXT:    v_readlane_b32 s30, v4, 0
 ; GFX8-NEXT:    s_mov_b32 s32, s33
 ; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX8-NEXT:    buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -5005,12 +5011,12 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX900-NEXT:    v_writelane_b32 v3, s31, 1
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX900-NEXT:    v_readlane_b32 s30, v3, 0
 ; GFX900-NEXT:    buffer_store_short v1, v2, s[0:3], 0 offen offset:4
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    v_readlane_b32 s31, v3, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v3, 0
 ; GFX900-NEXT:    s_mov_b32 s32, s33
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -5033,16 +5039,17 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX950-NEXT:    s_addc_u32 s1, s1, test_arg_store_v2bf16 at gotpcrel32@hi+12
 ; GFX950-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; GFX950-NEXT:    v_writelane_b32 v5, s30, 0
-; GFX950-NEXT:    v_writelane_b32 v5, s31, 1
 ; GFX950-NEXT:    v_mov_b32_e32 v4, v2
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_writelane_b32 v5, s31, 1
 ; GFX950-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX950-NEXT:    v_readlane_b32 s30, v5, 0
 ; GFX950-NEXT:    scratch_store_short v4, v1, off offset:4 sc0 sc1
 ; GFX950-NEXT:    s_waitcnt vmcnt(0)
 ; GFX950-NEXT:    scratch_store_dword v4, v0, off sc0 sc1
 ; GFX950-NEXT:    s_waitcnt vmcnt(0)
 ; GFX950-NEXT:    v_readlane_b32 s31, v5, 1
-; GFX950-NEXT:    v_readlane_b32 s30, v5, 0
 ; GFX950-NEXT:    s_mov_b32 s32, s33
 ; GFX950-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX950-NEXT:    scratch_load_dword v5, off, s33 ; 4-byte Folded Reload
@@ -5069,12 +5076,12 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX10-NEXT:    v_writelane_b32 v3, s31, 1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX10-NEXT:    v_readlane_b32 s30, v3, 0
 ; GFX10-NEXT:    buffer_store_short v1, v2, s[0:3], 0 offen offset:4
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_readlane_b32 s31, v3, 1
-; GFX10-NEXT:    v_readlane_b32 s30, v3, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
 ; GFX10-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -5101,12 +5108,13 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX11-NEXT:    v_writelane_b32 v3, s31, 1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_readlane_b32 s30, v3, 0
 ; GFX11-NEXT:    scratch_store_b16 v2, v1, off offset:4 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    scratch_store_b32 v2, v0, off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    v_readlane_b32 s31, v3, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v3, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_load_b32 v3, off, s33 ; 4-byte Folded Reload
@@ -5134,12 +5142,13 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX1250-NEXT:    v_writelane_b32 v5, s31, 1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_swap_pc_i64 s[30:31], s[0:1]
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_readlane_b32 s30, v5, 0
 ; GFX1250-NEXT:    scratch_store_b16 v4, v1, off offset:4 scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 v4, v0, off scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    v_readlane_b32 s31, v5, 1
-; GFX1250-NEXT:    v_readlane_b32 s30, v5, 0
 ; GFX1250-NEXT:    s_mov_b32 s32, s33
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-NEXT:    s_xor_saveexec_b32 s0, -1
@@ -5193,8 +5202,8 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    buffer_store_short v0, v4, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_readlane_b32 s31, v8, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v8, 0
+; GCN-NEXT:    v_readlane_b32 s31, v8, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -5234,13 +5243,13 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX7-NEXT:    buffer_store_short v2, v3, s[0:3], 0 offen
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 2, v4
+; GFX7-NEXT:    v_readlane_b32 s30, v6, 0
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    buffer_store_short v1, v2, s[0:3], 0 offen
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_store_short v0, v4, s[0:3], 0 offen
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_readlane_b32 s31, v6, 1
-; GFX7-NEXT:    v_readlane_b32 s30, v6, 0
 ; GFX7-NEXT:    s_mov_b32 s32, s33
 ; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX7-NEXT:    buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -5267,12 +5276,12 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 4, v2
+; GFX8-NEXT:    v_readlane_b32 s30, v4, 0
 ; GFX8-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_readlane_b32 s31, v4, 1
-; GFX8-NEXT:    v_readlane_b32 s30, v4, 0
 ; GFX8-NEXT:    s_mov_b32 s32, s33
 ; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX8-NEXT:    buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -5298,12 +5307,12 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX900-NEXT:    v_writelane_b32 v3, s31, 1
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX900-NEXT:    v_readlane_b32 s30, v3, 0
 ; GFX900-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    v_readlane_b32 s31, v3, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v3, 0
 ; GFX900-NEXT:    s_mov_b32 s32, s33
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -5326,14 +5335,15 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX950-NEXT:    s_addc_u32 s1, s1, test_arg_store_v2bf16 at gotpcrel32@hi+12
 ; GFX950-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; GFX950-NEXT:    v_writelane_b32 v5, s30, 0
-; GFX950-NEXT:    v_writelane_b32 v5, s31, 1
 ; GFX950-NEXT:    v_mov_b32_e32 v4, v2
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_writelane_b32 v5, s31, 1
 ; GFX950-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX950-NEXT:    v_readlane_b32 s30, v5, 0
 ; GFX950-NEXT:    scratch_store_dwordx2 v4, v[0:1], off sc0 sc1
 ; GFX950-NEXT:    s_waitcnt vmcnt(0)
 ; GFX950-NEXT:    v_readlane_b32 s31, v5, 1
-; GFX950-NEXT:    v_readlane_b32 s30, v5, 0
 ; GFX950-NEXT:    s_mov_b32 s32, s33
 ; GFX950-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX950-NEXT:    scratch_load_dword v5, off, s33 ; 4-byte Folded Reload
@@ -5360,12 +5370,12 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX10-NEXT:    v_writelane_b32 v3, s31, 1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX10-NEXT:    v_readlane_b32 s30, v3, 0
 ; GFX10-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_readlane_b32 s31, v3, 1
-; GFX10-NEXT:    v_readlane_b32 s30, v3, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
 ; GFX10-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -5392,10 +5402,11 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX11-NEXT:    v_writelane_b32 v3, s31, 1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_readlane_b32 s30, v3, 0
 ; GFX11-NEXT:    scratch_store_b64 v2, v[0:1], off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    v_readlane_b32 s31, v3, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v3, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_load_b32 v3, off, s33 ; 4-byte Folded Reload
@@ -5423,10 +5434,11 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX1250-NEXT:    v_writelane_b32 v5, s31, 1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_swap_pc_i64 s[30:31], s[0:1]
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_readlane_b32 s30, v5, 0
 ; GFX1250-NEXT:    scratch_store_b64 v4, v[0:1], off scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    v_readlane_b32 s31, v5, 1
-; GFX1250-NEXT:    v_readlane_b32 s30, v5, 0
 ; GFX1250-NEXT:    s_mov_b32 s32, s33
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-NEXT:    s_xor_saveexec_b32 s0, -1
@@ -5500,8 +5512,8 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    buffer_store_short v0, v8, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_readlane_b32 s31, v16, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v16, 0
+; GCN-NEXT:    v_readlane_b32 s31, v16, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -5561,13 +5573,13 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX7-NEXT:    buffer_store_short v2, v3, s[0:3], 0 offen
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 2, v8
+; GFX7-NEXT:    v_readlane_b32 s30, v10, 0
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    buffer_store_short v1, v2, s[0:3], 0 offen
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_store_short v0, v8, s[0:3], 0 offen
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_readlane_b32 s31, v10, 1
-; GFX7-NEXT:    v_readlane_b32 s30, v10, 0
 ; GFX7-NEXT:    s_mov_b32 s32, s33
 ; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX7-NEXT:    buffer_load_dword v10, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -5600,12 +5612,12 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX8-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 4, v4
+; GFX8-NEXT:    v_readlane_b32 s30, v6, 0
 ; GFX8-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_readlane_b32 s31, v6, 1
-; GFX8-NEXT:    v_readlane_b32 s30, v6, 0
 ; GFX8-NEXT:    s_mov_b32 s32, s33
 ; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX8-NEXT:    buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -5631,6 +5643,7 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX900-NEXT:    v_writelane_b32 v5, s31, 1
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX900-NEXT:    v_readlane_b32 s30, v5, 0
 ; GFX900-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen offset:12
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen offset:8
@@ -5640,7 +5653,6 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX900-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    v_readlane_b32 s31, v5, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v5, 0
 ; GFX900-NEXT:    s_mov_b32 s32, s33
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -5663,13 +5675,14 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX950-NEXT:    s_addc_u32 s1, s1, test_arg_store_v2bf16 at gotpcrel32@hi+12
 ; GFX950-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; GFX950-NEXT:    v_writelane_b32 v5, s30, 0
+; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_writelane_b32 v5, s31, 1
 ; GFX950-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX950-NEXT:    v_readlane_b32 s30, v5, 0
 ; GFX950-NEXT:    scratch_store_dwordx4 v4, v[0:3], off sc0 sc1
 ; GFX950-NEXT:    s_waitcnt vmcnt(0)
 ; GFX950-NEXT:    v_readlane_b32 s31, v5, 1
-; GFX950-NEXT:    v_readlane_b32 s30, v5, 0
 ; GFX950-NEXT:    s_mov_b32 s32, s33
 ; GFX950-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX950-NEXT:    scratch_load_dword v5, off, s33 ; 4-byte Folded Reload
@@ -5696,6 +5709,7 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX10-NEXT:    v_writelane_b32 v5, s31, 1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX10-NEXT:    v_readlane_b32 s30, v5, 0
 ; GFX10-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen offset:12
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen offset:8
@@ -5705,7 +5719,6 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX10-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_readlane_b32 s31, v5, 1
-; GFX10-NEXT:    v_readlane_b32 s30, v5, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
 ; GFX10-NEXT:    buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -5732,10 +5745,11 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX11-NEXT:    v_writelane_b32 v5, s31, 1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_readlane_b32 s30, v5, 0
 ; GFX11-NEXT:    scratch_store_b128 v4, v[0:3], off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    v_readlane_b32 s31, v5, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v5, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_load_b32 v5, off, s33 ; 4-byte Folded Reload
@@ -5762,10 +5776,11 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX1250-NEXT:    v_writelane_b32 v5, s31, 1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_swap_pc_i64 s[30:31], s[0:1]
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_readlane_b32 s30, v5, 0
 ; GFX1250-NEXT:    scratch_store_b128 v4, v[0:3], off scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    v_readlane_b32 s31, v5, 1
-; GFX1250-NEXT:    v_readlane_b32 s30, v5, 0
 ; GFX1250-NEXT:    s_mov_b32 s32, s33
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-NEXT:    s_xor_saveexec_b32 s0, -1
@@ -5879,8 +5894,8 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    buffer_store_short v0, v16, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_readlane_b32 s31, v20, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v20, 0
+; GCN-NEXT:    v_readlane_b32 s31, v20, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    buffer_load_dword v20, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -5980,13 +5995,13 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX7-NEXT:    buffer_store_short v2, v3, s[0:3], 0 offen
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 2, v16
+; GFX7-NEXT:    v_readlane_b32 s30, v18, 0
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    buffer_store_short v1, v2, s[0:3], 0 offen
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_store_short v0, v16, s[0:3], 0 offen
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_readlane_b32 s31, v18, 1
-; GFX7-NEXT:    v_readlane_b32 s30, v18, 0
 ; GFX7-NEXT:    s_mov_b32 s32, s33
 ; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX7-NEXT:    buffer_load_dword v18, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -6031,12 +6046,12 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX8-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 4, v8
+; GFX8-NEXT:    v_readlane_b32 s30, v10, 0
 ; GFX8-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_store_dword v0, v8, s[0:3], 0 offen
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_readlane_b32 s31, v10, 1
-; GFX8-NEXT:    v_readlane_b32 s30, v10, 0
 ; GFX8-NEXT:    s_mov_b32 s32, s33
 ; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX8-NEXT:    buffer_load_dword v10, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -6062,6 +6077,7 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX900-NEXT:    v_writelane_b32 v9, s31, 1
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX900-NEXT:    v_readlane_b32 s30, v9, 0
 ; GFX900-NEXT:    buffer_store_dword v7, v8, s[0:3], 0 offen offset:28
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    buffer_store_dword v6, v8, s[0:3], 0 offen offset:24
@@ -6079,7 +6095,6 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX900-NEXT:    buffer_store_dword v0, v8, s[0:3], 0 offen
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    v_readlane_b32 s31, v9, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v9, 0
 ; GFX900-NEXT:    s_mov_b32 s32, s33
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -6102,15 +6117,16 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX950-NEXT:    s_addc_u32 s1, s1, test_arg_store_v2bf16 at gotpcrel32@hi+12
 ; GFX950-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; GFX950-NEXT:    v_writelane_b32 v9, s30, 0
+; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_writelane_b32 v9, s31, 1
 ; GFX950-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX950-NEXT:    v_readlane_b32 s30, v9, 0
 ; GFX950-NEXT:    scratch_store_dwordx4 v8, v[4:7], off offset:16 sc0 sc1
 ; GFX950-NEXT:    s_waitcnt vmcnt(0)
 ; GFX950-NEXT:    scratch_store_dwordx4 v8, v[0:3], off sc0 sc1
 ; GFX950-NEXT:    s_waitcnt vmcnt(0)
 ; GFX950-NEXT:    v_readlane_b32 s31, v9, 1
-; GFX950-NEXT:    v_readlane_b32 s30, v9, 0
 ; GFX950-NEXT:    s_mov_b32 s32, s33
 ; GFX950-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX950-NEXT:    scratch_load_dword v9, off, s33 ; 4-byte Folded Reload
@@ -6137,6 +6153,7 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX10-NEXT:    v_writelane_b32 v9, s31, 1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX10-NEXT:    v_readlane_b32 s30, v9, 0
 ; GFX10-NEXT:    buffer_store_dword v7, v8, s[0:3], 0 offen offset:28
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_store_dword v6, v8, s[0:3], 0 offen offset:24
@@ -6154,7 +6171,6 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX10-NEXT:    buffer_store_dword v0, v8, s[0:3], 0 offen
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_readlane_b32 s31, v9, 1
-; GFX10-NEXT:    v_readlane_b32 s30, v9, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
 ; GFX10-NEXT:    buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -6181,12 +6197,13 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX11-NEXT:    v_writelane_b32 v9, s31, 1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_readlane_b32 s30, v9, 0
 ; GFX11-NEXT:    scratch_store_b128 v8, v[4:7], off offset:16 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    scratch_store_b128 v8, v[0:3], off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    v_readlane_b32 s31, v9, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v9, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_load_b32 v9, off, s33 ; 4-byte Folded Reload
@@ -6213,12 +6230,13 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX1250-NEXT:    v_writelane_b32 v9, s31, 1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_swap_pc_i64 s[30:31], s[0:1]
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_readlane_b32 s30, v9, 0
 ; GFX1250-NEXT:    scratch_store_b128 v8, v[4:7], off offset:16 scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    scratch_store_b128 v8, v[0:3], off scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    v_readlane_b32 s31, v9, 1
-; GFX1250-NEXT:    v_readlane_b32 s30, v9, 0
 ; GFX1250-NEXT:    s_mov_b32 s32, s33
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-NEXT:    s_xor_saveexec_b32 s0, -1
@@ -48879,34 +48897,34 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v20
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[60:61], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v21
+; GFX8-NEXT:    v_writelane_b32 v34, s34, 0
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[62:63], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v22
+; GFX8-NEXT:    v_writelane_b32 v34, s35, 1
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[72:73], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v23
+; GFX8-NEXT:    v_writelane_b32 v34, s36, 2
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[74:75], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v24
+; GFX8-NEXT:    v_writelane_b32 v34, s37, 3
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[76:77], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v25
-; GFX8-NEXT:    v_writelane_b32 v34, s30, 0
+; GFX8-NEXT:    v_writelane_b32 v34, s38, 4
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[78:79], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v26
-; GFX8-NEXT:    v_writelane_b32 v34, s31, 1
+; GFX8-NEXT:    v_writelane_b32 v34, s39, 5
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[88:89], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v27
-; GFX8-NEXT:    v_writelane_b32 v34, s34, 2
+; GFX8-NEXT:    v_writelane_b32 v34, s30, 6
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[90:91], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v28
-; GFX8-NEXT:    v_writelane_b32 v34, s35, 3
+; GFX8-NEXT:    v_writelane_b32 v34, s31, 7
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[30:31], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v29
-; GFX8-NEXT:    v_writelane_b32 v34, s36, 4
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[34:35], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v30
-; GFX8-NEXT:    v_writelane_b32 v34, s37, 5
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[36:37], 1, v0
 ; GFX8-NEXT:    buffer_load_ushort v0, off, s[0:3], s32
-; GFX8-NEXT:    v_writelane_b32 v34, s38, 6
-; GFX8-NEXT:    v_writelane_b32 v34, s39, 7
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[38:39], 1, v0
@@ -49032,6 +49050,7 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v28
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v26
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v24
+; GFX8-NEXT:    v_readlane_b32 s30, v34, 6
 ; GFX8-NEXT:    v_or_b32_sdwa v8, v16, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v9, v18, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v10, v20, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -49040,14 +49059,13 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX8-NEXT:    v_or_b32_sdwa v13, v29, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v14, v27, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v15, v25, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_readlane_b32 s39, v34, 7
-; GFX8-NEXT:    v_readlane_b32 s38, v34, 6
-; GFX8-NEXT:    v_readlane_b32 s37, v34, 5
-; GFX8-NEXT:    v_readlane_b32 s36, v34, 4
-; GFX8-NEXT:    v_readlane_b32 s35, v34, 3
-; GFX8-NEXT:    v_readlane_b32 s34, v34, 2
-; GFX8-NEXT:    v_readlane_b32 s31, v34, 1
-; GFX8-NEXT:    v_readlane_b32 s30, v34, 0
+; GFX8-NEXT:    v_readlane_b32 s31, v34, 7
+; GFX8-NEXT:    v_readlane_b32 s39, v34, 5
+; GFX8-NEXT:    v_readlane_b32 s38, v34, 4
+; GFX8-NEXT:    v_readlane_b32 s37, v34, 3
+; GFX8-NEXT:    v_readlane_b32 s36, v34, 2
+; GFX8-NEXT:    v_readlane_b32 s35, v34, 1
+; GFX8-NEXT:    v_readlane_b32 s34, v34, 0
 ; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX8-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
 ; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
@@ -49119,11 +49137,11 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX900-NEXT:    v_and_b32_e32 v0, 1, v28
 ; GFX900-NEXT:    v_cmp_eq_u32_e64 s[94:95], 1, v0
 ; GFX900-NEXT:    buffer_load_ushort v0, off, s[0:3], s32
-; GFX900-NEXT:    v_writelane_b32 v33, s30, 0
-; GFX900-NEXT:    v_writelane_b32 v33, s31, 1
-; GFX900-NEXT:    v_writelane_b32 v33, s34, 2
+; GFX900-NEXT:    v_writelane_b32 v33, s34, 0
+; GFX900-NEXT:    v_writelane_b32 v33, s35, 1
+; GFX900-NEXT:    v_writelane_b32 v33, s30, 2
+; GFX900-NEXT:    v_writelane_b32 v33, s31, 3
 ; GFX900-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX900-NEXT:    v_writelane_b32 v33, s35, 3
 ; GFX900-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    v_and_b32_e32 v0, 1, v0
@@ -49228,6 +49246,7 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX900-NEXT:    v_readlane_b32 s30, v33, 2
 ; GFX900-NEXT:    v_perm_b32 v0, v0, v3, s4
 ; GFX900-NEXT:    v_perm_b32 v1, v2, v5, s4
 ; GFX900-NEXT:    v_perm_b32 v2, v4, v7, s4
@@ -49244,10 +49263,9 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX900-NEXT:    v_perm_b32 v13, v26, v29, s4
 ; GFX900-NEXT:    v_perm_b32 v14, v28, v32, s4
 ; GFX900-NEXT:    v_perm_b32 v15, v31, v30, s4
-; GFX900-NEXT:    v_readlane_b32 s35, v33, 3
-; GFX900-NEXT:    v_readlane_b32 s34, v33, 2
-; GFX900-NEXT:    v_readlane_b32 s31, v33, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v33, 0
+; GFX900-NEXT:    v_readlane_b32 s31, v33, 3
+; GFX900-NEXT:    v_readlane_b32 s35, v33, 1
+; GFX900-NEXT:    v_readlane_b32 s34, v33, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll b/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll
index ab2ad19d0f1bf..fb11d3b7d9d65 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll
@@ -902,47 +902,47 @@ define void @spill_func(ptr addrspace(1) %arg) #0 {
 ; CHECK-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; CHECK-NEXT:    s_mov_b64 exec, s[4:5]
 ; CHECK-NEXT:    s_waitcnt expcnt(0)
-; CHECK-NEXT:    v_writelane_b32 v0, s30, 0
-; CHECK-NEXT:    v_writelane_b32 v0, s31, 1
-; CHECK-NEXT:    v_writelane_b32 v0, s33, 2
-; CHECK-NEXT:    v_writelane_b32 v0, s34, 3
-; CHECK-NEXT:    v_writelane_b32 v0, s35, 4
-; CHECK-NEXT:    v_writelane_b32 v0, s36, 5
-; CHECK-NEXT:    v_writelane_b32 v0, s37, 6
-; CHECK-NEXT:    v_writelane_b32 v0, s38, 7
-; CHECK-NEXT:    v_writelane_b32 v0, s39, 8
-; CHECK-NEXT:    v_writelane_b32 v0, s48, 9
-; CHECK-NEXT:    v_writelane_b32 v0, s49, 10
-; CHECK-NEXT:    v_writelane_b32 v0, s50, 11
-; CHECK-NEXT:    v_writelane_b32 v0, s51, 12
-; CHECK-NEXT:    v_writelane_b32 v0, s52, 13
-; CHECK-NEXT:    v_writelane_b32 v0, s53, 14
-; CHECK-NEXT:    v_writelane_b32 v0, s54, 15
-; CHECK-NEXT:    v_writelane_b32 v0, s55, 16
-; CHECK-NEXT:    v_writelane_b32 v0, s64, 17
-; CHECK-NEXT:    v_writelane_b32 v0, s65, 18
-; CHECK-NEXT:    v_writelane_b32 v0, s66, 19
-; CHECK-NEXT:    v_writelane_b32 v0, s67, 20
-; CHECK-NEXT:    v_writelane_b32 v0, s68, 21
-; CHECK-NEXT:    v_writelane_b32 v0, s69, 22
-; CHECK-NEXT:    v_writelane_b32 v0, s70, 23
-; CHECK-NEXT:    v_writelane_b32 v0, s71, 24
-; CHECK-NEXT:    v_writelane_b32 v0, s80, 25
-; CHECK-NEXT:    v_writelane_b32 v0, s81, 26
-; CHECK-NEXT:    v_writelane_b32 v0, s82, 27
-; CHECK-NEXT:    v_writelane_b32 v0, s83, 28
-; CHECK-NEXT:    v_writelane_b32 v0, s84, 29
-; CHECK-NEXT:    v_writelane_b32 v0, s85, 30
-; CHECK-NEXT:    v_writelane_b32 v0, s86, 31
-; CHECK-NEXT:    v_writelane_b32 v0, s87, 32
-; CHECK-NEXT:    v_writelane_b32 v0, s96, 33
-; CHECK-NEXT:    v_writelane_b32 v0, s97, 34
-; CHECK-NEXT:    v_writelane_b32 v0, s98, 35
-; CHECK-NEXT:    v_writelane_b32 v0, s99, 36
+; CHECK-NEXT:    v_writelane_b32 v0, s33, 0
+; CHECK-NEXT:    v_writelane_b32 v0, s34, 1
+; CHECK-NEXT:    v_writelane_b32 v0, s35, 2
+; CHECK-NEXT:    v_writelane_b32 v0, s36, 3
+; CHECK-NEXT:    v_writelane_b32 v0, s37, 4
+; CHECK-NEXT:    v_writelane_b32 v0, s38, 5
+; CHECK-NEXT:    v_writelane_b32 v0, s39, 6
+; CHECK-NEXT:    v_writelane_b32 v0, s48, 7
+; CHECK-NEXT:    v_writelane_b32 v0, s49, 8
+; CHECK-NEXT:    v_writelane_b32 v0, s50, 9
+; CHECK-NEXT:    v_writelane_b32 v0, s51, 10
+; CHECK-NEXT:    v_writelane_b32 v0, s52, 11
+; CHECK-NEXT:    v_writelane_b32 v0, s53, 12
+; CHECK-NEXT:    v_writelane_b32 v0, s54, 13
+; CHECK-NEXT:    v_writelane_b32 v0, s55, 14
+; CHECK-NEXT:    v_writelane_b32 v0, s64, 15
+; CHECK-NEXT:    v_writelane_b32 v0, s65, 16
+; CHECK-NEXT:    v_writelane_b32 v0, s66, 17
+; CHECK-NEXT:    v_writelane_b32 v0, s67, 18
+; CHECK-NEXT:    v_writelane_b32 v0, s68, 19
+; CHECK-NEXT:    v_writelane_b32 v0, s69, 20
+; CHECK-NEXT:    v_writelane_b32 v0, s70, 21
+; CHECK-NEXT:    v_writelane_b32 v0, s71, 22
+; CHECK-NEXT:    v_writelane_b32 v0, s80, 23
+; CHECK-NEXT:    v_writelane_b32 v0, s81, 24
+; CHECK-NEXT:    v_writelane_b32 v0, s82, 25
+; CHECK-NEXT:    v_writelane_b32 v0, s83, 26
+; CHECK-NEXT:    v_writelane_b32 v0, s84, 27
+; CHECK-NEXT:    v_writelane_b32 v0, s85, 28
+; CHECK-NEXT:    v_writelane_b32 v0, s86, 29
+; CHECK-NEXT:    v_writelane_b32 v0, s87, 30
+; CHECK-NEXT:    v_writelane_b32 v0, s96, 31
+; CHECK-NEXT:    v_writelane_b32 v0, s97, 32
+; CHECK-NEXT:    v_writelane_b32 v0, s98, 33
+; CHECK-NEXT:    v_writelane_b32 v0, s99, 34
+; CHECK-NEXT:    v_writelane_b32 v0, s100, 35
+; CHECK-NEXT:    v_writelane_b32 v0, s101, 36
 ; CHECK-NEXT:    s_mov_b32 s40, s12
-; CHECK-NEXT:    v_writelane_b32 v0, s100, 37
+; CHECK-NEXT:    v_writelane_b32 v0, s30, 37
 ; CHECK-NEXT:    s_cmp_eq_u32 s40, 0
-; CHECK-NEXT:    v_writelane_b32 v0, s101, 38
+; CHECK-NEXT:    v_writelane_b32 v0, s31, 38
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    s_mov_b32 s0, 0
 ; CHECK-NEXT:    ;;#ASMEND
@@ -1380,6 +1380,7 @@ define void @spill_func(ptr addrspace(1) %arg) #0 {
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; reg use s31
 ; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    v_readlane_b32 s30, v0, 37
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; reg use s32
 ; CHECK-NEXT:    ;;#ASMEND
@@ -1596,45 +1597,44 @@ define void @spill_func(ptr addrspace(1) %arg) #0 {
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; reg use vcc_hi
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s101, v0, 38
-; CHECK-NEXT:    v_readlane_b32 s100, v0, 37
-; CHECK-NEXT:    v_readlane_b32 s99, v0, 36
-; CHECK-NEXT:    v_readlane_b32 s98, v0, 35
-; CHECK-NEXT:    v_readlane_b32 s97, v0, 34
-; CHECK-NEXT:    v_readlane_b32 s96, v0, 33
-; CHECK-NEXT:    v_readlane_b32 s87, v0, 32
-; CHECK-NEXT:    v_readlane_b32 s86, v0, 31
-; CHECK-NEXT:    v_readlane_b32 s85, v0, 30
-; CHECK-NEXT:    v_readlane_b32 s84, v0, 29
-; CHECK-NEXT:    v_readlane_b32 s83, v0, 28
-; CHECK-NEXT:    v_readlane_b32 s82, v0, 27
-; CHECK-NEXT:    v_readlane_b32 s81, v0, 26
-; CHECK-NEXT:    v_readlane_b32 s80, v0, 25
-; CHECK-NEXT:    v_readlane_b32 s71, v0, 24
-; CHECK-NEXT:    v_readlane_b32 s70, v0, 23
-; CHECK-NEXT:    v_readlane_b32 s69, v0, 22
-; CHECK-NEXT:    v_readlane_b32 s68, v0, 21
-; CHECK-NEXT:    v_readlane_b32 s67, v0, 20
-; CHECK-NEXT:    v_readlane_b32 s66, v0, 19
-; CHECK-NEXT:    v_readlane_b32 s65, v0, 18
-; CHECK-NEXT:    v_readlane_b32 s64, v0, 17
-; CHECK-NEXT:    v_readlane_b32 s55, v0, 16
-; CHECK-NEXT:    v_readlane_b32 s54, v0, 15
-; CHECK-NEXT:    v_readlane_b32 s53, v0, 14
-; CHECK-NEXT:    v_readlane_b32 s52, v0, 13
-; CHECK-NEXT:    v_readlane_b32 s51, v0, 12
-; CHECK-NEXT:    v_readlane_b32 s50, v0, 11
-; CHECK-NEXT:    v_readlane_b32 s49, v0, 10
-; CHECK-NEXT:    v_readlane_b32 s48, v0, 9
-; CHECK-NEXT:    v_readlane_b32 s39, v0, 8
-; CHECK-NEXT:    v_readlane_b32 s38, v0, 7
-; CHECK-NEXT:    v_readlane_b32 s37, v0, 6
-; CHECK-NEXT:    v_readlane_b32 s36, v0, 5
-; CHECK-NEXT:    v_readlane_b32 s35, v0, 4
-; CHECK-NEXT:    v_readlane_b32 s34, v0, 3
-; CHECK-NEXT:    v_readlane_b32 s33, v0, 2
-; CHECK-NEXT:    v_readlane_b32 s31, v0, 1
-; CHECK-NEXT:    v_readlane_b32 s30, v0, 0
+; CHECK-NEXT:    v_readlane_b32 s31, v0, 38
+; CHECK-NEXT:    v_readlane_b32 s101, v0, 36
+; CHECK-NEXT:    v_readlane_b32 s100, v0, 35
+; CHECK-NEXT:    v_readlane_b32 s99, v0, 34
+; CHECK-NEXT:    v_readlane_b32 s98, v0, 33
+; CHECK-NEXT:    v_readlane_b32 s97, v0, 32
+; CHECK-NEXT:    v_readlane_b32 s96, v0, 31
+; CHECK-NEXT:    v_readlane_b32 s87, v0, 30
+; CHECK-NEXT:    v_readlane_b32 s86, v0, 29
+; CHECK-NEXT:    v_readlane_b32 s85, v0, 28
+; CHECK-NEXT:    v_readlane_b32 s84, v0, 27
+; CHECK-NEXT:    v_readlane_b32 s83, v0, 26
+; CHECK-NEXT:    v_readlane_b32 s82, v0, 25
+; CHECK-NEXT:    v_readlane_b32 s81, v0, 24
+; CHECK-NEXT:    v_readlane_b32 s80, v0, 23
+; CHECK-NEXT:    v_readlane_b32 s71, v0, 22
+; CHECK-NEXT:    v_readlane_b32 s70, v0, 21
+; CHECK-NEXT:    v_readlane_b32 s69, v0, 20
+; CHECK-NEXT:    v_readlane_b32 s68, v0, 19
+; CHECK-NEXT:    v_readlane_b32 s67, v0, 18
+; CHECK-NEXT:    v_readlane_b32 s66, v0, 17
+; CHECK-NEXT:    v_readlane_b32 s65, v0, 16
+; CHECK-NEXT:    v_readlane_b32 s64, v0, 15
+; CHECK-NEXT:    v_readlane_b32 s55, v0, 14
+; CHECK-NEXT:    v_readlane_b32 s54, v0, 13
+; CHECK-NEXT:    v_readlane_b32 s53, v0, 12
+; CHECK-NEXT:    v_readlane_b32 s52, v0, 11
+; CHECK-NEXT:    v_readlane_b32 s51, v0, 10
+; CHECK-NEXT:    v_readlane_b32 s50, v0, 9
+; CHECK-NEXT:    v_readlane_b32 s49, v0, 8
+; CHECK-NEXT:    v_readlane_b32 s48, v0, 7
+; CHECK-NEXT:    v_readlane_b32 s39, v0, 6
+; CHECK-NEXT:    v_readlane_b32 s38, v0, 5
+; CHECK-NEXT:    v_readlane_b32 s37, v0, 4
+; CHECK-NEXT:    v_readlane_b32 s36, v0, 3
+; CHECK-NEXT:    v_readlane_b32 s35, v0, 2
+; CHECK-NEXT:    v_readlane_b32 s34, v0, 1
+; CHECK-NEXT:    v_readlane_b32 s33, v0, 0
 ; CHECK-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; CHECK-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; CHECK-NEXT:    s_mov_b64 exec, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
index c407f7645315d..d86fbb9754c02 100644
--- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -7178,8 +7178,8 @@ define void @stack_12xv3i32() #0 {
 ; VI-NEXT:    v_mov_b32_e32 v30, 10
 ; VI-NEXT:    v_writelane_b32 v40, s31, 1
 ; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; VI-NEXT:    v_readlane_b32 s31, v40, 1
 ; VI-NEXT:    v_readlane_b32 s30, v40, 0
+; VI-NEXT:    v_readlane_b32 s31, v40, 1
 ; VI-NEXT:    s_mov_b32 s32, s33
 ; VI-NEXT:    v_readlane_b32 s4, v40, 2
 ; VI-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -7246,8 +7246,8 @@ define void @stack_12xv3i32() #0 {
 ; CI-NEXT:    v_mov_b32_e32 v30, 10
 ; CI-NEXT:    v_writelane_b32 v40, s31, 1
 ; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; CI-NEXT:    v_readlane_b32 s31, v40, 1
 ; CI-NEXT:    v_readlane_b32 s30, v40, 0
+; CI-NEXT:    v_readlane_b32 s31, v40, 1
 ; CI-NEXT:    s_mov_b32 s32, s33
 ; CI-NEXT:    v_readlane_b32 s4, v40, 2
 ; CI-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -7314,8 +7314,8 @@ define void @stack_12xv3i32() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v30, 10
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -7364,8 +7364,8 @@ define void @stack_12xv3i32() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -7432,8 +7432,8 @@ define void @stack_12xv3i32() #0 {
 ; HSA-NEXT:    v_mov_b32_e32 v30, 10
 ; HSA-NEXT:    v_writelane_b32 v40, s31, 1
 ; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; HSA-NEXT:    v_readlane_b32 s31, v40, 1
 ; HSA-NEXT:    v_readlane_b32 s30, v40, 0
+; HSA-NEXT:    v_readlane_b32 s31, v40, 1
 ; HSA-NEXT:    s_mov_b32 s32, s33
 ; HSA-NEXT:    v_readlane_b32 s4, v40, 2
 ; HSA-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -7517,8 +7517,8 @@ define void @stack_12xv3f32() #0 {
 ; VI-NEXT:    v_mov_b32_e32 v30, 0x41200000
 ; VI-NEXT:    v_writelane_b32 v40, s31, 1
 ; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; VI-NEXT:    v_readlane_b32 s31, v40, 1
 ; VI-NEXT:    v_readlane_b32 s30, v40, 0
+; VI-NEXT:    v_readlane_b32 s31, v40, 1
 ; VI-NEXT:    s_mov_b32 s32, s33
 ; VI-NEXT:    v_readlane_b32 s4, v40, 2
 ; VI-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -7585,8 +7585,8 @@ define void @stack_12xv3f32() #0 {
 ; CI-NEXT:    v_mov_b32_e32 v30, 0x41200000
 ; CI-NEXT:    v_writelane_b32 v40, s31, 1
 ; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; CI-NEXT:    v_readlane_b32 s31, v40, 1
 ; CI-NEXT:    v_readlane_b32 s30, v40, 0
+; CI-NEXT:    v_readlane_b32 s31, v40, 1
 ; CI-NEXT:    s_mov_b32 s32, s33
 ; CI-NEXT:    v_readlane_b32 s4, v40, 2
 ; CI-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -7653,8 +7653,8 @@ define void @stack_12xv3f32() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v30, 0x41200000
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -7707,8 +7707,8 @@ define void @stack_12xv3f32() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -7775,8 +7775,8 @@ define void @stack_12xv3f32() #0 {
 ; HSA-NEXT:    v_mov_b32_e32 v30, 0x41200000
 ; HSA-NEXT:    v_writelane_b32 v40, s31, 1
 ; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; HSA-NEXT:    v_readlane_b32 s31, v40, 1
 ; HSA-NEXT:    v_readlane_b32 s30, v40, 0
+; HSA-NEXT:    v_readlane_b32 s31, v40, 1
 ; HSA-NEXT:    s_mov_b32 s32, s33
 ; HSA-NEXT:    v_readlane_b32 s4, v40, 2
 ; HSA-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -7868,8 +7868,8 @@ define void @stack_8xv5i32() #0 {
 ; VI-NEXT:    v_mov_b32_e32 v30, 6
 ; VI-NEXT:    v_writelane_b32 v40, s31, 1
 ; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; VI-NEXT:    v_readlane_b32 s31, v40, 1
 ; VI-NEXT:    v_readlane_b32 s30, v40, 0
+; VI-NEXT:    v_readlane_b32 s31, v40, 1
 ; VI-NEXT:    s_mov_b32 s32, s33
 ; VI-NEXT:    v_readlane_b32 s4, v40, 2
 ; VI-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -7944,8 +7944,8 @@ define void @stack_8xv5i32() #0 {
 ; CI-NEXT:    v_mov_b32_e32 v30, 6
 ; CI-NEXT:    v_writelane_b32 v40, s31, 1
 ; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; CI-NEXT:    v_readlane_b32 s31, v40, 1
 ; CI-NEXT:    v_readlane_b32 s30, v40, 0
+; CI-NEXT:    v_readlane_b32 s31, v40, 1
 ; CI-NEXT:    s_mov_b32 s32, s33
 ; CI-NEXT:    v_readlane_b32 s4, v40, 2
 ; CI-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -8020,8 +8020,8 @@ define void @stack_8xv5i32() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v30, 6
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -8075,8 +8075,8 @@ define void @stack_8xv5i32() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -8151,8 +8151,8 @@ define void @stack_8xv5i32() #0 {
 ; HSA-NEXT:    v_mov_b32_e32 v30, 6
 ; HSA-NEXT:    v_writelane_b32 v40, s31, 1
 ; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; HSA-NEXT:    v_readlane_b32 s31, v40, 1
 ; HSA-NEXT:    v_readlane_b32 s30, v40, 0
+; HSA-NEXT:    v_readlane_b32 s31, v40, 1
 ; HSA-NEXT:    s_mov_b32 s32, s33
 ; HSA-NEXT:    v_readlane_b32 s4, v40, 2
 ; HSA-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -8240,8 +8240,8 @@ define void @stack_8xv5f32() #0 {
 ; VI-NEXT:    v_mov_b32_e32 v30, 0x40c00000
 ; VI-NEXT:    v_writelane_b32 v40, s31, 1
 ; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; VI-NEXT:    v_readlane_b32 s31, v40, 1
 ; VI-NEXT:    v_readlane_b32 s30, v40, 0
+; VI-NEXT:    v_readlane_b32 s31, v40, 1
 ; VI-NEXT:    s_mov_b32 s32, s33
 ; VI-NEXT:    v_readlane_b32 s4, v40, 2
 ; VI-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -8316,8 +8316,8 @@ define void @stack_8xv5f32() #0 {
 ; CI-NEXT:    v_mov_b32_e32 v30, 0x40c00000
 ; CI-NEXT:    v_writelane_b32 v40, s31, 1
 ; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; CI-NEXT:    v_readlane_b32 s31, v40, 1
 ; CI-NEXT:    v_readlane_b32 s30, v40, 0
+; CI-NEXT:    v_readlane_b32 s31, v40, 1
 ; CI-NEXT:    s_mov_b32 s32, s33
 ; CI-NEXT:    v_readlane_b32 s4, v40, 2
 ; CI-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -8392,8 +8392,8 @@ define void @stack_8xv5f32() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v30, 0x40c00000
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -8450,8 +8450,8 @@ define void @stack_8xv5f32() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -8526,8 +8526,8 @@ define void @stack_8xv5f32() #0 {
 ; HSA-NEXT:    v_mov_b32_e32 v30, 0x40c00000
 ; HSA-NEXT:    v_writelane_b32 v40, s31, 1
 ; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; HSA-NEXT:    v_readlane_b32 s31, v40, 1
 ; HSA-NEXT:    v_readlane_b32 s30, v40, 0
+; HSA-NEXT:    v_readlane_b32 s31, v40, 1
 ; HSA-NEXT:    s_mov_b32 s32, s33
 ; HSA-NEXT:    v_readlane_b32 s4, v40, 2
 ; HSA-NEXT:    s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
index b250227735bd3..26727e53d990c 100644
--- a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
@@ -25,8 +25,8 @@ define void @use_vcc() #1 {
 ; GCN: v_writelane_b32 v40, s30, 0
 ; GCN: v_writelane_b32 v40, s31, 1
 ; GCN: s_swappc_b64
-; GCN: v_readlane_b32 s31, v40, 1
 ; GCN: v_readlane_b32 s30, v40, 0
+; GCN: v_readlane_b32 s31, v40, 1
 ; GCN: v_readlane_b32 s4, v40, 2
 ; GCN: s_mov_b32 s33, s4
 ; GCN: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll
index aed1079158154..f9070339093da 100644
--- a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll
@@ -40,22 +40,22 @@ define void @test_func_call_external_void_func_void_clobber_s30_s31_call_externa
 ; MUBUF-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; MUBUF-NEXT:    s_mov_b64 exec, s[6:7]
 ; MUBUF-NEXT:    v_writelane_b32 v40, s4, 4
-; MUBUF-NEXT:    v_writelane_b32 v40, s30, 0
-; MUBUF-NEXT:    v_writelane_b32 v40, s31, 1
+; MUBUF-NEXT:    v_writelane_b32 v40, s34, 0
 ; MUBUF-NEXT:    s_addk_i32 s32, 0x400
-; MUBUF-NEXT:    v_writelane_b32 v40, s34, 2
-; MUBUF-NEXT:    v_writelane_b32 v40, s35, 3
+; MUBUF-NEXT:    v_writelane_b32 v40, s35, 1
+; MUBUF-NEXT:    v_writelane_b32 v40, s30, 2
 ; MUBUF-NEXT:    s_getpc_b64 s[34:35]
 ; MUBUF-NEXT:    s_add_u32 s34, s34, external_void_func_void at rel32@lo+4
 ; MUBUF-NEXT:    s_addc_u32 s35, s35, external_void_func_void at rel32@hi+12
+; MUBUF-NEXT:    v_writelane_b32 v40, s31, 3
 ; MUBUF-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; MUBUF-NEXT:    ;;#ASMSTART
 ; MUBUF-NEXT:    ;;#ASMEND
 ; MUBUF-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; MUBUF-NEXT:    v_readlane_b32 s35, v40, 3
-; MUBUF-NEXT:    v_readlane_b32 s34, v40, 2
-; MUBUF-NEXT:    v_readlane_b32 s31, v40, 1
-; MUBUF-NEXT:    v_readlane_b32 s30, v40, 0
+; MUBUF-NEXT:    v_readlane_b32 s30, v40, 2
+; MUBUF-NEXT:    v_readlane_b32 s31, v40, 3
+; MUBUF-NEXT:    v_readlane_b32 s35, v40, 1
+; MUBUF-NEXT:    v_readlane_b32 s34, v40, 0
 ; MUBUF-NEXT:    s_mov_b32 s32, s33
 ; MUBUF-NEXT:    v_readlane_b32 s4, v40, 4
 ; MUBUF-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -74,22 +74,22 @@ define void @test_func_call_external_void_func_void_clobber_s30_s31_call_externa
 ; FLATSCR-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; FLATSCR-NEXT:    s_mov_b64 exec, s[2:3]
 ; FLATSCR-NEXT:    v_writelane_b32 v40, s0, 4
-; FLATSCR-NEXT:    v_writelane_b32 v40, s30, 0
-; FLATSCR-NEXT:    v_writelane_b32 v40, s31, 1
+; FLATSCR-NEXT:    v_writelane_b32 v40, s34, 0
 ; FLATSCR-NEXT:    s_add_i32 s32, s32, 16
-; FLATSCR-NEXT:    v_writelane_b32 v40, s34, 2
-; FLATSCR-NEXT:    v_writelane_b32 v40, s35, 3
+; FLATSCR-NEXT:    v_writelane_b32 v40, s35, 1
+; FLATSCR-NEXT:    v_writelane_b32 v40, s30, 2
 ; FLATSCR-NEXT:    s_getpc_b64 s[34:35]
 ; FLATSCR-NEXT:    s_add_u32 s34, s34, external_void_func_void at rel32@lo+4
 ; FLATSCR-NEXT:    s_addc_u32 s35, s35, external_void_func_void at rel32@hi+12
+; FLATSCR-NEXT:    v_writelane_b32 v40, s31, 3
 ; FLATSCR-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; FLATSCR-NEXT:    ;;#ASMSTART
 ; FLATSCR-NEXT:    ;;#ASMEND
 ; FLATSCR-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; FLATSCR-NEXT:    v_readlane_b32 s35, v40, 3
-; FLATSCR-NEXT:    v_readlane_b32 s34, v40, 2
-; FLATSCR-NEXT:    v_readlane_b32 s31, v40, 1
-; FLATSCR-NEXT:    v_readlane_b32 s30, v40, 0
+; FLATSCR-NEXT:    v_readlane_b32 s30, v40, 2
+; FLATSCR-NEXT:    v_readlane_b32 s31, v40, 3
+; FLATSCR-NEXT:    v_readlane_b32 s35, v40, 1
+; FLATSCR-NEXT:    v_readlane_b32 s34, v40, 0
 ; FLATSCR-NEXT:    s_mov_b32 s32, s33
 ; FLATSCR-NEXT:    v_readlane_b32 s0, v40, 4
 ; FLATSCR-NEXT:    s_or_saveexec_b64 s[2:3], -1
@@ -114,20 +114,20 @@ define void @test_func_call_external_void_funcx2() #0 {
 ; MUBUF-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; MUBUF-NEXT:    s_mov_b64 exec, s[6:7]
 ; MUBUF-NEXT:    v_writelane_b32 v40, s4, 4
-; MUBUF-NEXT:    v_writelane_b32 v40, s30, 0
-; MUBUF-NEXT:    v_writelane_b32 v40, s31, 1
+; MUBUF-NEXT:    v_writelane_b32 v40, s34, 0
 ; MUBUF-NEXT:    s_addk_i32 s32, 0x400
-; MUBUF-NEXT:    v_writelane_b32 v40, s34, 2
-; MUBUF-NEXT:    v_writelane_b32 v40, s35, 3
+; MUBUF-NEXT:    v_writelane_b32 v40, s35, 1
+; MUBUF-NEXT:    v_writelane_b32 v40, s30, 2
 ; MUBUF-NEXT:    s_getpc_b64 s[34:35]
 ; MUBUF-NEXT:    s_add_u32 s34, s34, external_void_func_void at rel32@lo+4
 ; MUBUF-NEXT:    s_addc_u32 s35, s35, external_void_func_void at rel32@hi+12
+; MUBUF-NEXT:    v_writelane_b32 v40, s31, 3
 ; MUBUF-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; MUBUF-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; MUBUF-NEXT:    v_readlane_b32 s35, v40, 3
-; MUBUF-NEXT:    v_readlane_b32 s34, v40, 2
-; MUBUF-NEXT:    v_readlane_b32 s31, v40, 1
-; MUBUF-NEXT:    v_readlane_b32 s30, v40, 0
+; MUBUF-NEXT:    v_readlane_b32 s30, v40, 2
+; MUBUF-NEXT:    v_readlane_b32 s31, v40, 3
+; MUBUF-NEXT:    v_readlane_b32 s35, v40, 1
+; MUBUF-NEXT:    v_readlane_b32 s34, v40, 0
 ; MUBUF-NEXT:    s_mov_b32 s32, s33
 ; MUBUF-NEXT:    v_readlane_b32 s4, v40, 4
 ; MUBUF-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -146,20 +146,20 @@ define void @test_func_call_external_void_funcx2() #0 {
 ; FLATSCR-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; FLATSCR-NEXT:    s_mov_b64 exec, s[2:3]
 ; FLATSCR-NEXT:    v_writelane_b32 v40, s0, 4
-; FLATSCR-NEXT:    v_writelane_b32 v40, s30, 0
-; FLATSCR-NEXT:    v_writelane_b32 v40, s31, 1
+; FLATSCR-NEXT:    v_writelane_b32 v40, s34, 0
 ; FLATSCR-NEXT:    s_add_i32 s32, s32, 16
-; FLATSCR-NEXT:    v_writelane_b32 v40, s34, 2
-; FLATSCR-NEXT:    v_writelane_b32 v40, s35, 3
+; FLATSCR-NEXT:    v_writelane_b32 v40, s35, 1
+; FLATSCR-NEXT:    v_writelane_b32 v40, s30, 2
 ; FLATSCR-NEXT:    s_getpc_b64 s[34:35]
 ; FLATSCR-NEXT:    s_add_u32 s34, s34, external_void_func_void at rel32@lo+4
 ; FLATSCR-NEXT:    s_addc_u32 s35, s35, external_void_func_void at rel32@hi+12
+; FLATSCR-NEXT:    v_writelane_b32 v40, s31, 3
 ; FLATSCR-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; FLATSCR-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; FLATSCR-NEXT:    v_readlane_b32 s35, v40, 3
-; FLATSCR-NEXT:    v_readlane_b32 s34, v40, 2
-; FLATSCR-NEXT:    v_readlane_b32 s31, v40, 1
-; FLATSCR-NEXT:    v_readlane_b32 s30, v40, 0
+; FLATSCR-NEXT:    v_readlane_b32 s30, v40, 2
+; FLATSCR-NEXT:    v_readlane_b32 s31, v40, 3
+; FLATSCR-NEXT:    v_readlane_b32 s35, v40, 1
+; FLATSCR-NEXT:    v_readlane_b32 s34, v40, 0
 ; FLATSCR-NEXT:    s_mov_b32 s32, s33
 ; FLATSCR-NEXT:    v_readlane_b32 s0, v40, 4
 ; FLATSCR-NEXT:    s_or_saveexec_b64 s[2:3], -1
@@ -185,8 +185,8 @@ define void @void_func_void_clobber_s30_s31() #2 {
 ; MUBUF-NEXT:    ;;#ASMSTART
 ; MUBUF-NEXT:    ; clobber
 ; MUBUF-NEXT:    ;;#ASMEND
-; MUBUF-NEXT:    v_readlane_b32 s31, v0, 1
 ; MUBUF-NEXT:    v_readlane_b32 s30, v0, 0
+; MUBUF-NEXT:    v_readlane_b32 s31, v0, 1
 ; MUBUF-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; MUBUF-NEXT:    s_mov_b64 exec, s[4:5]
@@ -204,8 +204,8 @@ define void @void_func_void_clobber_s30_s31() #2 {
 ; FLATSCR-NEXT:    ;;#ASMSTART
 ; FLATSCR-NEXT:    ; clobber
 ; FLATSCR-NEXT:    ;;#ASMEND
-; FLATSCR-NEXT:    v_readlane_b32 s31, v0, 1
 ; FLATSCR-NEXT:    v_readlane_b32 s30, v0, 0
+; FLATSCR-NEXT:    v_readlane_b32 s31, v0, 1
 ; FLATSCR-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; FLATSCR-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; FLATSCR-NEXT:    s_mov_b64 exec, s[0:1]
@@ -452,23 +452,23 @@ define void @callee_saved_sgpr_func() #2 {
 ; MUBUF-NEXT:    s_mov_b64 exec, s[6:7]
 ; MUBUF-NEXT:    v_writelane_b32 v40, s4, 3
 ; MUBUF-NEXT:    s_addk_i32 s32, 0x400
-; MUBUF-NEXT:    v_writelane_b32 v40, s30, 0
-; MUBUF-NEXT:    v_writelane_b32 v40, s31, 1
+; MUBUF-NEXT:    v_writelane_b32 v40, s34, 0
+; MUBUF-NEXT:    v_writelane_b32 v40, s30, 1
 ; MUBUF-NEXT:    s_getpc_b64 s[4:5]
 ; MUBUF-NEXT:    s_add_u32 s4, s4, external_void_func_void at rel32@lo+4
 ; MUBUF-NEXT:    s_addc_u32 s5, s5, external_void_func_void at rel32@hi+12
-; MUBUF-NEXT:    v_writelane_b32 v40, s34, 2
+; MUBUF-NEXT:    v_writelane_b32 v40, s31, 2
 ; MUBUF-NEXT:    ;;#ASMSTART
 ; MUBUF-NEXT:    ; def s40
 ; MUBUF-NEXT:    ;;#ASMEND
 ; MUBUF-NEXT:    s_mov_b32 s34, s40
 ; MUBUF-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; MUBUF-NEXT:    v_readlane_b32 s30, v40, 1
 ; MUBUF-NEXT:    ;;#ASMSTART
 ; MUBUF-NEXT:    ; use s34
 ; MUBUF-NEXT:    ;;#ASMEND
-; MUBUF-NEXT:    v_readlane_b32 s34, v40, 2
-; MUBUF-NEXT:    v_readlane_b32 s31, v40, 1
-; MUBUF-NEXT:    v_readlane_b32 s30, v40, 0
+; MUBUF-NEXT:    v_readlane_b32 s31, v40, 2
+; MUBUF-NEXT:    v_readlane_b32 s34, v40, 0
 ; MUBUF-NEXT:    s_mov_b32 s32, s33
 ; MUBUF-NEXT:    v_readlane_b32 s4, v40, 3
 ; MUBUF-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -488,23 +488,23 @@ define void @callee_saved_sgpr_func() #2 {
 ; FLATSCR-NEXT:    s_mov_b64 exec, s[2:3]
 ; FLATSCR-NEXT:    v_writelane_b32 v40, s0, 3
 ; FLATSCR-NEXT:    s_add_i32 s32, s32, 16
-; FLATSCR-NEXT:    v_writelane_b32 v40, s30, 0
-; FLATSCR-NEXT:    v_writelane_b32 v40, s31, 1
+; FLATSCR-NEXT:    v_writelane_b32 v40, s34, 0
+; FLATSCR-NEXT:    v_writelane_b32 v40, s30, 1
 ; FLATSCR-NEXT:    s_getpc_b64 s[0:1]
 ; FLATSCR-NEXT:    s_add_u32 s0, s0, external_void_func_void at rel32@lo+4
 ; FLATSCR-NEXT:    s_addc_u32 s1, s1, external_void_func_void at rel32@hi+12
-; FLATSCR-NEXT:    v_writelane_b32 v40, s34, 2
+; FLATSCR-NEXT:    v_writelane_b32 v40, s31, 2
 ; FLATSCR-NEXT:    ;;#ASMSTART
 ; FLATSCR-NEXT:    ; def s40
 ; FLATSCR-NEXT:    ;;#ASMEND
 ; FLATSCR-NEXT:    s_mov_b32 s34, s40
 ; FLATSCR-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; FLATSCR-NEXT:    v_readlane_b32 s30, v40, 1
 ; FLATSCR-NEXT:    ;;#ASMSTART
 ; FLATSCR-NEXT:    ; use s34
 ; FLATSCR-NEXT:    ;;#ASMEND
-; FLATSCR-NEXT:    v_readlane_b32 s34, v40, 2
-; FLATSCR-NEXT:    v_readlane_b32 s31, v40, 1
-; FLATSCR-NEXT:    v_readlane_b32 s30, v40, 0
+; FLATSCR-NEXT:    v_readlane_b32 s31, v40, 2
+; FLATSCR-NEXT:    v_readlane_b32 s34, v40, 0
 ; FLATSCR-NEXT:    s_mov_b32 s32, s33
 ; FLATSCR-NEXT:    v_readlane_b32 s0, v40, 3
 ; FLATSCR-NEXT:    s_or_saveexec_b64 s[2:3], -1
@@ -555,13 +555,13 @@ define void @callee_saved_sgpr_vgpr_func() #2 {
 ; MUBUF-NEXT:    s_mov_b64 exec, s[6:7]
 ; MUBUF-NEXT:    v_writelane_b32 v41, s4, 3
 ; MUBUF-NEXT:    s_addk_i32 s32, 0x400
-; MUBUF-NEXT:    v_writelane_b32 v41, s30, 0
-; MUBUF-NEXT:    v_writelane_b32 v41, s31, 1
+; MUBUF-NEXT:    v_writelane_b32 v41, s34, 0
+; MUBUF-NEXT:    v_writelane_b32 v41, s30, 1
 ; MUBUF-NEXT:    s_getpc_b64 s[4:5]
 ; MUBUF-NEXT:    s_add_u32 s4, s4, external_void_func_void at rel32@lo+4
 ; MUBUF-NEXT:    s_addc_u32 s5, s5, external_void_func_void at rel32@hi+12
 ; MUBUF-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
-; MUBUF-NEXT:    v_writelane_b32 v41, s34, 2
+; MUBUF-NEXT:    v_writelane_b32 v41, s31, 2
 ; MUBUF-NEXT:    ;;#ASMSTART
 ; MUBUF-NEXT:    ; def s40
 ; MUBUF-NEXT:    ;;#ASMEND
@@ -577,9 +577,9 @@ define void @callee_saved_sgpr_vgpr_func() #2 {
 ; MUBUF-NEXT:    ; use v40
 ; MUBUF-NEXT:    ;;#ASMEND
 ; MUBUF-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
-; MUBUF-NEXT:    v_readlane_b32 s34, v41, 2
-; MUBUF-NEXT:    v_readlane_b32 s31, v41, 1
-; MUBUF-NEXT:    v_readlane_b32 s30, v41, 0
+; MUBUF-NEXT:    v_readlane_b32 s30, v41, 1
+; MUBUF-NEXT:    v_readlane_b32 s31, v41, 2
+; MUBUF-NEXT:    v_readlane_b32 s34, v41, 0
 ; MUBUF-NEXT:    s_mov_b32 s32, s33
 ; MUBUF-NEXT:    v_readlane_b32 s4, v41, 3
 ; MUBUF-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -599,13 +599,13 @@ define void @callee_saved_sgpr_vgpr_func() #2 {
 ; FLATSCR-NEXT:    s_mov_b64 exec, s[2:3]
 ; FLATSCR-NEXT:    v_writelane_b32 v41, s0, 3
 ; FLATSCR-NEXT:    s_add_i32 s32, s32, 16
-; FLATSCR-NEXT:    v_writelane_b32 v41, s30, 0
-; FLATSCR-NEXT:    v_writelane_b32 v41, s31, 1
+; FLATSCR-NEXT:    v_writelane_b32 v41, s34, 0
+; FLATSCR-NEXT:    v_writelane_b32 v41, s30, 1
 ; FLATSCR-NEXT:    s_getpc_b64 s[0:1]
 ; FLATSCR-NEXT:    s_add_u32 s0, s0, external_void_func_void at rel32@lo+4
 ; FLATSCR-NEXT:    s_addc_u32 s1, s1, external_void_func_void at rel32@hi+12
 ; FLATSCR-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
-; FLATSCR-NEXT:    v_writelane_b32 v41, s34, 2
+; FLATSCR-NEXT:    v_writelane_b32 v41, s31, 2
 ; FLATSCR-NEXT:    ;;#ASMSTART
 ; FLATSCR-NEXT:    ; def s40
 ; FLATSCR-NEXT:    ;;#ASMEND
@@ -621,9 +621,9 @@ define void @callee_saved_sgpr_vgpr_func() #2 {
 ; FLATSCR-NEXT:    ; use v40
 ; FLATSCR-NEXT:    ;;#ASMEND
 ; FLATSCR-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
-; FLATSCR-NEXT:    v_readlane_b32 s34, v41, 2
-; FLATSCR-NEXT:    v_readlane_b32 s31, v41, 1
-; FLATSCR-NEXT:    v_readlane_b32 s30, v41, 0
+; FLATSCR-NEXT:    v_readlane_b32 s30, v41, 1
+; FLATSCR-NEXT:    v_readlane_b32 s31, v41, 2
+; FLATSCR-NEXT:    v_readlane_b32 s34, v41, 0
 ; FLATSCR-NEXT:    s_mov_b32 s32, s33
 ; FLATSCR-NEXT:    v_readlane_b32 s0, v41, 3
 ; FLATSCR-NEXT:    s_or_saveexec_b64 s[2:3], -1
diff --git a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
index e7254eb5c3465..07f58df81c502 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
@@ -132,8 +132,8 @@ define void @callee_with_stack_and_call() #0 {
 ; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s33
 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 ; MUBUF-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; MUBUF-NEXT:    v_readlane_b32 s31, v40, 1
 ; MUBUF-NEXT:    v_readlane_b32 s30, v40, 0
+; MUBUF-NEXT:    v_readlane_b32 s31, v40, 1
 ; MUBUF-NEXT:    s_mov_b32 s32, s33
 ; MUBUF-NEXT:    v_readlane_b32 s4, v40, 2
 ; MUBUF-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -162,8 +162,8 @@ define void @callee_with_stack_and_call() #0 {
 ; FLATSCR-NEXT:    scratch_store_dword off, v0, s33
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; FLATSCR-NEXT:    v_readlane_b32 s31, v40, 1
 ; FLATSCR-NEXT:    v_readlane_b32 s30, v40, 0
+; FLATSCR-NEXT:    v_readlane_b32 s31, v40, 1
 ; FLATSCR-NEXT:    s_mov_b32 s32, s33
 ; FLATSCR-NEXT:    v_readlane_b32 s0, v40, 2
 ; FLATSCR-NEXT:    s_or_saveexec_b64 s[2:3], -1
@@ -201,8 +201,8 @@ define void @callee_no_stack_with_call() #0 {
 ; MUBUF-NEXT:    s_addc_u32 s17, s17, external_void_func_void at rel32@hi+12
 ; MUBUF-NEXT:    v_writelane_b32 v40, s31, 1
 ; MUBUF-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; MUBUF-NEXT:    v_readlane_b32 s31, v40, 1
 ; MUBUF-NEXT:    v_readlane_b32 s30, v40, 0
+; MUBUF-NEXT:    v_readlane_b32 s31, v40, 1
 ; MUBUF-NEXT:    s_mov_b32 s32, s33
 ; MUBUF-NEXT:    v_readlane_b32 s4, v40, 2
 ; MUBUF-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -228,8 +228,8 @@ define void @callee_no_stack_with_call() #0 {
 ; FLATSCR-NEXT:    s_addc_u32 s1, s1, external_void_func_void at rel32@hi+12
 ; FLATSCR-NEXT:    v_writelane_b32 v40, s31, 1
 ; FLATSCR-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; FLATSCR-NEXT:    v_readlane_b32 s31, v40, 1
 ; FLATSCR-NEXT:    v_readlane_b32 s30, v40, 0
+; FLATSCR-NEXT:    v_readlane_b32 s31, v40, 1
 ; FLATSCR-NEXT:    s_mov_b32 s32, s33
 ; FLATSCR-NEXT:    v_readlane_b32 s0, v40, 2
 ; FLATSCR-NEXT:    s_or_saveexec_b64 s[2:3], -1
@@ -359,24 +359,24 @@ define void @callee_func_sgpr_spill_no_calls(i32 %in) #0 {
 ; FLATSCR-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; FLATSCR-NEXT:    scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
 ; FLATSCR-NEXT:    s_mov_b64 exec, s[0:1]
-; FLATSCR-NEXT:    v_writelane_b32 v40, s30, 0
-; FLATSCR-NEXT:    v_writelane_b32 v40, s31, 1
-; FLATSCR-NEXT:    v_writelane_b32 v40, s36, 2
-; FLATSCR-NEXT:    v_writelane_b32 v40, s37, 3
-; FLATSCR-NEXT:    v_writelane_b32 v40, s38, 4
-; FLATSCR-NEXT:    v_writelane_b32 v40, s39, 5
-; FLATSCR-NEXT:    v_writelane_b32 v40, s48, 6
-; FLATSCR-NEXT:    v_writelane_b32 v40, s49, 7
-; FLATSCR-NEXT:    v_writelane_b32 v40, s50, 8
-; FLATSCR-NEXT:    v_writelane_b32 v40, s51, 9
-; FLATSCR-NEXT:    v_writelane_b32 v40, s52, 10
-; FLATSCR-NEXT:    v_writelane_b32 v40, s53, 11
-; FLATSCR-NEXT:    v_writelane_b32 v40, s54, 12
-; FLATSCR-NEXT:    v_writelane_b32 v40, s55, 13
-; FLATSCR-NEXT:    v_writelane_b32 v40, s64, 14
-; FLATSCR-NEXT:    v_writelane_b32 v40, s65, 15
-; FLATSCR-NEXT:    v_writelane_b32 v40, s66, 16
-; FLATSCR-NEXT:    v_writelane_b32 v40, s67, 17
+; FLATSCR-NEXT:    v_writelane_b32 v40, s36, 0
+; FLATSCR-NEXT:    v_writelane_b32 v40, s37, 1
+; FLATSCR-NEXT:    v_writelane_b32 v40, s38, 2
+; FLATSCR-NEXT:    v_writelane_b32 v40, s39, 3
+; FLATSCR-NEXT:    v_writelane_b32 v40, s48, 4
+; FLATSCR-NEXT:    v_writelane_b32 v40, s49, 5
+; FLATSCR-NEXT:    v_writelane_b32 v40, s50, 6
+; FLATSCR-NEXT:    v_writelane_b32 v40, s51, 7
+; FLATSCR-NEXT:    v_writelane_b32 v40, s52, 8
+; FLATSCR-NEXT:    v_writelane_b32 v40, s53, 9
+; FLATSCR-NEXT:    v_writelane_b32 v40, s54, 10
+; FLATSCR-NEXT:    v_writelane_b32 v40, s55, 11
+; FLATSCR-NEXT:    v_writelane_b32 v40, s64, 12
+; FLATSCR-NEXT:    v_writelane_b32 v40, s65, 13
+; FLATSCR-NEXT:    v_writelane_b32 v40, s66, 14
+; FLATSCR-NEXT:    v_writelane_b32 v40, s67, 15
+; FLATSCR-NEXT:    v_writelane_b32 v40, s30, 16
+; FLATSCR-NEXT:    v_writelane_b32 v40, s31, 17
 ; FLATSCR-NEXT:    ;;#ASMSTART
 ; FLATSCR-NEXT:    ;;#ASMEND
 ; FLATSCR-NEXT:    ;;#ASMSTART
@@ -414,6 +414,7 @@ define void @callee_func_sgpr_spill_no_calls(i32 %in) #0 {
 ; FLATSCR-NEXT:    ;;#ASMSTART
 ; FLATSCR-NEXT:    ; use s[16:31]
 ; FLATSCR-NEXT:    ;;#ASMEND
+; FLATSCR-NEXT:    v_readlane_b32 s30, v40, 16
 ; FLATSCR-NEXT:    ;;#ASMSTART
 ; FLATSCR-NEXT:    ; use s[72:79]
 ; FLATSCR-NEXT:    ;;#ASMEND
@@ -423,24 +424,23 @@ define void @callee_func_sgpr_spill_no_calls(i32 %in) #0 {
 ; FLATSCR-NEXT:    ;;#ASMSTART
 ; FLATSCR-NEXT:    ; use s[0:15]
 ; FLATSCR-NEXT:    ;;#ASMEND
-; FLATSCR-NEXT:    v_readlane_b32 s67, v40, 17
-; FLATSCR-NEXT:    v_readlane_b32 s66, v40, 16
-; FLATSCR-NEXT:    v_readlane_b32 s65, v40, 15
-; FLATSCR-NEXT:    v_readlane_b32 s64, v40, 14
-; FLATSCR-NEXT:    v_readlane_b32 s55, v40, 13
-; FLATSCR-NEXT:    v_readlane_b32 s54, v40, 12
-; FLATSCR-NEXT:    v_readlane_b32 s53, v40, 11
-; FLATSCR-NEXT:    v_readlane_b32 s52, v40, 10
-; FLATSCR-NEXT:    v_readlane_b32 s51, v40, 9
-; FLATSCR-NEXT:    v_readlane_b32 s50, v40, 8
-; FLATSCR-NEXT:    v_readlane_b32 s49, v40, 7
-; FLATSCR-NEXT:    v_readlane_b32 s48, v40, 6
-; FLATSCR-NEXT:    v_readlane_b32 s39, v40, 5
-; FLATSCR-NEXT:    v_readlane_b32 s38, v40, 4
-; FLATSCR-NEXT:    v_readlane_b32 s37, v40, 3
-; FLATSCR-NEXT:    v_readlane_b32 s36, v40, 2
-; FLATSCR-NEXT:    v_readlane_b32 s31, v40, 1
-; FLATSCR-NEXT:    v_readlane_b32 s30, v40, 0
+; FLATSCR-NEXT:    v_readlane_b32 s31, v40, 17
+; FLATSCR-NEXT:    v_readlane_b32 s67, v40, 15
+; FLATSCR-NEXT:    v_readlane_b32 s66, v40, 14
+; FLATSCR-NEXT:    v_readlane_b32 s65, v40, 13
+; FLATSCR-NEXT:    v_readlane_b32 s64, v40, 12
+; FLATSCR-NEXT:    v_readlane_b32 s55, v40, 11
+; FLATSCR-NEXT:    v_readlane_b32 s54, v40, 10
+; FLATSCR-NEXT:    v_readlane_b32 s53, v40, 9
+; FLATSCR-NEXT:    v_readlane_b32 s52, v40, 8
+; FLATSCR-NEXT:    v_readlane_b32 s51, v40, 7
+; FLATSCR-NEXT:    v_readlane_b32 s50, v40, 6
+; FLATSCR-NEXT:    v_readlane_b32 s49, v40, 5
+; FLATSCR-NEXT:    v_readlane_b32 s48, v40, 4
+; FLATSCR-NEXT:    v_readlane_b32 s39, v40, 3
+; FLATSCR-NEXT:    v_readlane_b32 s38, v40, 2
+; FLATSCR-NEXT:    v_readlane_b32 s37, v40, 1
+; FLATSCR-NEXT:    v_readlane_b32 s36, v40, 0
 ; FLATSCR-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; FLATSCR-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; FLATSCR-NEXT:    s_mov_b64 exec, s[0:1]
@@ -971,14 +971,14 @@ define void @no_unused_non_csr_sgpr_for_fp() #1 {
 ; MUBUF-NEXT:    s_mov_b64 exec, s[4:5]
 ; MUBUF-NEXT:    v_writelane_b32 v1, s30, 0
 ; MUBUF-NEXT:    v_mov_b32_e32 v0, 0
-; MUBUF-NEXT:    s_addk_i32 s32, 0x300
 ; MUBUF-NEXT:    v_writelane_b32 v1, s31, 1
 ; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s33
 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 ; MUBUF-NEXT:    ;;#ASMSTART
 ; MUBUF-NEXT:    ;;#ASMEND
-; MUBUF-NEXT:    v_readlane_b32 s31, v1, 1
+; MUBUF-NEXT:    s_addk_i32 s32, 0x300
 ; MUBUF-NEXT:    v_readlane_b32 s30, v1, 0
+; MUBUF-NEXT:    v_readlane_b32 s31, v1, 1
 ; MUBUF-NEXT:    s_mov_b32 s32, s33
 ; MUBUF-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; MUBUF-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
@@ -997,14 +997,14 @@ define void @no_unused_non_csr_sgpr_for_fp() #1 {
 ; FLATSCR-NEXT:    s_mov_b64 exec, s[0:1]
 ; FLATSCR-NEXT:    v_writelane_b32 v1, s30, 0
 ; FLATSCR-NEXT:    v_mov_b32_e32 v0, 0
-; FLATSCR-NEXT:    s_add_i32 s32, s32, 12
 ; FLATSCR-NEXT:    v_writelane_b32 v1, s31, 1
 ; FLATSCR-NEXT:    scratch_store_dword off, v0, s33
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT:    ;;#ASMSTART
 ; FLATSCR-NEXT:    ;;#ASMEND
-; FLATSCR-NEXT:    v_readlane_b32 s31, v1, 1
+; FLATSCR-NEXT:    s_add_i32 s32, s32, 12
 ; FLATSCR-NEXT:    v_readlane_b32 s30, v1, 0
+; FLATSCR-NEXT:    v_readlane_b32 s31, v1, 1
 ; FLATSCR-NEXT:    s_mov_b32 s32, s33
 ; FLATSCR-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; FLATSCR-NEXT:    scratch_load_dword v1, off, s33 offset:4 ; 4-byte Folded Reload
@@ -1037,17 +1037,17 @@ define void @no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr() #1 {
 ; MUBUF-NEXT:    s_mov_b64 exec, s[4:5]
 ; MUBUF-NEXT:    v_writelane_b32 v40, s30, 0
 ; MUBUF-NEXT:    v_mov_b32_e32 v0, 0
-; MUBUF-NEXT:    s_addk_i32 s32, 0x300
 ; MUBUF-NEXT:    v_writelane_b32 v40, s31, 1
 ; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s33
 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 ; MUBUF-NEXT:    ;;#ASMSTART
 ; MUBUF-NEXT:    ;;#ASMEND
+; MUBUF-NEXT:    s_addk_i32 s32, 0x300
+; MUBUF-NEXT:    v_readlane_b32 s30, v40, 0
 ; MUBUF-NEXT:    ;;#ASMSTART
 ; MUBUF-NEXT:    ; clobber nonpreserved initial VGPRs
 ; MUBUF-NEXT:    ;;#ASMEND
 ; MUBUF-NEXT:    v_readlane_b32 s31, v40, 1
-; MUBUF-NEXT:    v_readlane_b32 s30, v40, 0
 ; MUBUF-NEXT:    s_mov_b32 s32, s33
 ; MUBUF-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; MUBUF-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
@@ -1066,17 +1066,17 @@ define void @no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr() #1 {
 ; FLATSCR-NEXT:    s_mov_b64 exec, s[0:1]
 ; FLATSCR-NEXT:    v_writelane_b32 v40, s30, 0
 ; FLATSCR-NEXT:    v_mov_b32_e32 v0, 0
-; FLATSCR-NEXT:    s_add_i32 s32, s32, 12
 ; FLATSCR-NEXT:    v_writelane_b32 v40, s31, 1
 ; FLATSCR-NEXT:    scratch_store_dword off, v0, s33
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT:    ;;#ASMSTART
 ; FLATSCR-NEXT:    ;;#ASMEND
+; FLATSCR-NEXT:    s_add_i32 s32, s32, 12
+; FLATSCR-NEXT:    v_readlane_b32 s30, v40, 0
 ; FLATSCR-NEXT:    ;;#ASMSTART
 ; FLATSCR-NEXT:    ; clobber nonpreserved initial VGPRs
 ; FLATSCR-NEXT:    ;;#ASMEND
 ; FLATSCR-NEXT:    v_readlane_b32 s31, v40, 1
-; FLATSCR-NEXT:    v_readlane_b32 s30, v40, 0
 ; FLATSCR-NEXT:    s_mov_b32 s32, s33
 ; FLATSCR-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; FLATSCR-NEXT:    scratch_load_dword v40, off, s33 offset:4 ; 4-byte Folded Reload
@@ -1118,18 +1118,18 @@ define void @scratch_reg_needed_mubuf_offset(ptr addrspace(5) byval([4096 x i8])
 ; MUBUF-NEXT:    v_writelane_b32 v40, s30, 0
 ; MUBUF-NEXT:    v_mov_b32_e32 v0, 0
 ; MUBUF-NEXT:    v_mov_b32_e32 v1, 0x1000
-; MUBUF-NEXT:    s_add_i32 s32, s32, 0x40300
 ; MUBUF-NEXT:    v_writelane_b32 v40, s31, 1
 ; MUBUF-NEXT:    buffer_store_dword v0, v1, s[0:3], s33 offen
 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 ; MUBUF-NEXT:    ;;#ASMSTART
 ; MUBUF-NEXT:    ; clobber nonpreserved SGPRs
 ; MUBUF-NEXT:    ;;#ASMEND
+; MUBUF-NEXT:    s_add_i32 s32, s32, 0x40300
+; MUBUF-NEXT:    v_readlane_b32 s30, v40, 0
 ; MUBUF-NEXT:    ;;#ASMSTART
 ; MUBUF-NEXT:    ; clobber nonpreserved VGPRs
 ; MUBUF-NEXT:    ;;#ASMEND
 ; MUBUF-NEXT:    v_readlane_b32 s31, v40, 1
-; MUBUF-NEXT:    v_readlane_b32 s30, v40, 0
 ; MUBUF-NEXT:    s_mov_b32 s32, s33
 ; MUBUF-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; MUBUF-NEXT:    s_add_i32 s6, s33, 0x40100
@@ -1158,11 +1158,11 @@ define void @scratch_reg_needed_mubuf_offset(ptr addrspace(5) byval([4096 x i8])
 ; FLATSCR-NEXT:    ;;#ASMSTART
 ; FLATSCR-NEXT:    ; clobber nonpreserved SGPRs
 ; FLATSCR-NEXT:    ;;#ASMEND
+; FLATSCR-NEXT:    v_readlane_b32 s30, v40, 0
 ; FLATSCR-NEXT:    ;;#ASMSTART
 ; FLATSCR-NEXT:    ; clobber nonpreserved VGPRs
 ; FLATSCR-NEXT:    ;;#ASMEND
 ; FLATSCR-NEXT:    v_readlane_b32 s31, v40, 1
-; FLATSCR-NEXT:    v_readlane_b32 s30, v40, 0
 ; FLATSCR-NEXT:    s_mov_b32 s32, s33
 ; FLATSCR-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; FLATSCR-NEXT:    s_add_i32 s2, s33, 0x1004
@@ -1220,8 +1220,8 @@ define void @ipra_call_with_stack() #0 {
 ; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s33
 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 ; MUBUF-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; MUBUF-NEXT:    v_readlane_b32 s31, v1, 1
 ; MUBUF-NEXT:    v_readlane_b32 s30, v1, 0
+; MUBUF-NEXT:    v_readlane_b32 s31, v1, 1
 ; MUBUF-NEXT:    s_mov_b32 s32, s33
 ; MUBUF-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; MUBUF-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
@@ -1248,8 +1248,8 @@ define void @ipra_call_with_stack() #0 {
 ; FLATSCR-NEXT:    scratch_store_dword off, v0, s33
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; FLATSCR-NEXT:    v_readlane_b32 s31, v1, 1
 ; FLATSCR-NEXT:    v_readlane_b32 s30, v1, 0
+; FLATSCR-NEXT:    v_readlane_b32 s31, v1, 1
 ; FLATSCR-NEXT:    s_mov_b32 s32, s33
 ; FLATSCR-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; FLATSCR-NEXT:    scratch_load_dword v1, off, s33 offset:4 ; 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll
index 5f965ba431ab5..bb5963244da3c 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll
@@ -430,8 +430,8 @@ define void @func_indirect_use_workitem_id_x() #1 {
 ; GCN-NEXT:    v_writelane_b32 v40, s31, 1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 2
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -463,8 +463,8 @@ define void @func_indirect_use_workitem_id_y() #1 {
 ; GCN-NEXT:    v_writelane_b32 v40, s31, 1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 2
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -496,8 +496,8 @@ define void @func_indirect_use_workitem_id_z() #1 {
 ; GCN-NEXT:    v_writelane_b32 v40, s31, 1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 2
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -984,8 +984,8 @@ define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 {
 ; GFX7-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX7-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX7-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX7-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX7-NEXT:    s_mov_b32 s32, s33
 ; GFX7-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX7-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -1048,8 +1048,8 @@ define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 {
 ; GFX90A-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX90A-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX90A-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX90A-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX90A-NEXT:    s_mov_b32 s32, s33
 ; GFX90A-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX90A-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -1094,8 +1094,8 @@ define void @too_many_args_call_too_many_args_use_workitem_id_x(
 ; GCN-NEXT:    buffer_store_dword v32, off, s[0:3], s32
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 2
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -1445,8 +1445,8 @@ define void @func_call_too_many_args_use_workitem_id_x_byval() #1 {
 ; GCN-NEXT:    v_mov_b32_e32 v0, 10
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 2
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
index bb2f06bfe83f8..f20be656f3af0 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
@@ -275,8 +275,8 @@ define void @func_indirect_use_workitem_id_x() #1 {
 ; GCN-NEXT:    v_writelane_b32 v40, s31, 1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 2
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -308,8 +308,8 @@ define void @func_indirect_use_workitem_id_y() #1 {
 ; GCN-NEXT:    v_writelane_b32 v40, s31, 1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 2
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -341,8 +341,8 @@ define void @func_indirect_use_workitem_id_z() #1 {
 ; GCN-NEXT:    v_writelane_b32 v40, s31, 1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 2
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -696,8 +696,8 @@ define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 {
 ; GCN-NEXT:    v_writelane_b32 v40, s31, 1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 2
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -742,8 +742,8 @@ define void @too_many_args_call_too_many_args_use_workitem_id_x(
 ; GCN-NEXT:    buffer_store_dword v32, off, s[0:3], s32
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 2
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -1019,8 +1019,8 @@ define void @func_call_too_many_args_use_workitem_id_x_byval() #1 {
 ; GCN-NEXT:    v_mov_b32_e32 v0, 10
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 2
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -1469,8 +1469,8 @@ define void @func_call_no_workitem_id_hints() #2 {
 ; GCN-NEXT:    v_mov_b32_e32 v0, 9
 ; GCN-NEXT:    v_writelane_b32 v40, s31, 1
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 2
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
index 38c20c7cf62d6..0cab17c9bfcfc 100644
--- a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
@@ -40,8 +40,8 @@ define float @call_split_type_used_outside_block_v2f32() #0 {
 ; GCN-NEXT:    s_addc_u32 s17, s17, func_v2f32 at rel32@hi+12
 ; GCN-NEXT:    v_writelane_b32 v40, s31, 1
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 2
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -76,8 +76,8 @@ define float @call_split_type_used_outside_block_v3f32() #0 {
 ; GCN-NEXT:    s_addc_u32 s17, s17, func_v3f32 at rel32@hi+12
 ; GCN-NEXT:    v_writelane_b32 v40, s31, 1
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 2
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -112,8 +112,8 @@ define half @call_split_type_used_outside_block_v4f16() #0 {
 ; GCN-NEXT:    s_addc_u32 s17, s17, func_v4f16 at rel32@hi+12
 ; GCN-NEXT:    v_writelane_b32 v40, s31, 1
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 2
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -148,8 +148,8 @@ define { i32, half } @call_split_type_used_outside_block_struct() #0 {
 ; GCN-NEXT:    s_addc_u32 s17, s17, func_struct at rel32@hi+12
 ; GCN-NEXT:    v_writelane_b32 v40, s31, 1
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_mov_b32_e32 v1, v4
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 2
diff --git a/llvm/test/CodeGen/AMDGPU/debug-frame.ll b/llvm/test/CodeGen/AMDGPU/debug-frame.ll
index 676144e65c10f..7d38cea998256 100644
--- a/llvm/test/CodeGen/AMDGPU/debug-frame.ll
+++ b/llvm/test/CodeGen/AMDGPU/debug-frame.ll
@@ -2005,8 +2005,8 @@ define hidden void @func_call_clobber() #0 {
 ; GFX900-NEXT:    s_addc_u32 s17, s17, ex at rel32@hi+12
 ; GFX900-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX900-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX900-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX900-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX900-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX900-NEXT:    s_mov_b32 s32, s33
 ; GFX900-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX900-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -2278,8 +2278,8 @@ define hidden void @func_call_clobber() #0 {
 ; GFX90A-V2A-DIS-NEXT:    s_addc_u32 s17, s17, ex at rel32@hi+12
 ; GFX90A-V2A-DIS-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX90A-V2A-DIS-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX90A-V2A-DIS-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX90A-V2A-DIS-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX90A-V2A-DIS-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX90A-V2A-DIS-NEXT:    s_mov_b32 s32, s33
 ; GFX90A-V2A-DIS-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX90A-V2A-DIS-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -2551,8 +2551,8 @@ define hidden void @func_call_clobber() #0 {
 ; GFX90A-V2A-EN-NEXT:    s_addc_u32 s17, s17, ex at rel32@hi+12
 ; GFX90A-V2A-EN-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX90A-V2A-EN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX90A-V2A-EN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX90A-V2A-EN-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX90A-V2A-EN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX90A-V2A-EN-NEXT:    s_mov_b32 s32, s33
 ; GFX90A-V2A-EN-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX90A-V2A-EN-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -2793,8 +2793,8 @@ define hidden void @func_call_clobber() #0 {
 ; WAVE32-NEXT:    s_addc_u32 s17, s17, ex at rel32@hi+12
 ; WAVE32-NEXT:    v_writelane_b32 v40, s31, 1
 ; WAVE32-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; WAVE32-NEXT:    v_readlane_b32 s31, v40, 1
 ; WAVE32-NEXT:    v_readlane_b32 s30, v40, 0
+; WAVE32-NEXT:    v_readlane_b32 s31, v40, 1
 ; WAVE32-NEXT:    s_mov_b32 s32, s33
 ; WAVE32-NEXT:    v_readlane_b32 s4, v40, 2
 ; WAVE32-NEXT:    s_or_saveexec_b32 s5, -1
diff --git a/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll b/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll
index a0c25b2a0beb3..705d403764503 100644
--- a/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll
+++ b/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll
@@ -489,22 +489,20 @@ define weak_odr void @test(i32 %0) !dbg !34 {
 ; CHECK-NEXT:    v_writelane_b32 v41, s16, 16
 ; CHECK-NEXT:    .cfi_llvm_vector_registers 65, 2601, 16, 32
 ; CHECK-NEXT:    .cfi_def_cfa_register 65
-; CHECK-NEXT:    v_writelane_b32 v41, s30, 0
-; CHECK-NEXT:    v_writelane_b32 v41, s31, 1
-; CHECK-NEXT:    v_writelane_b32 v41, s34, 2
-; CHECK-NEXT:    v_writelane_b32 v41, s35, 3
-; CHECK-NEXT:    v_writelane_b32 v41, s36, 4
-; CHECK-NEXT:    v_writelane_b32 v41, s37, 5
-; CHECK-NEXT:    v_writelane_b32 v41, s38, 6
-; CHECK-NEXT:    v_writelane_b32 v41, s39, 7
-; CHECK-NEXT:    v_writelane_b32 v41, s48, 8
-; CHECK-NEXT:    v_writelane_b32 v41, s49, 9
-; CHECK-NEXT:    v_writelane_b32 v41, s50, 10
-; CHECK-NEXT:    v_writelane_b32 v41, s51, 11
-; CHECK-NEXT:    v_writelane_b32 v41, s52, 12
+; CHECK-NEXT:    v_writelane_b32 v41, s34, 0
+; CHECK-NEXT:    v_writelane_b32 v41, s35, 1
+; CHECK-NEXT:    v_writelane_b32 v41, s36, 2
+; CHECK-NEXT:    v_writelane_b32 v41, s37, 3
+; CHECK-NEXT:    v_writelane_b32 v41, s38, 4
+; CHECK-NEXT:    v_writelane_b32 v41, s39, 5
+; CHECK-NEXT:    v_writelane_b32 v41, s48, 6
+; CHECK-NEXT:    v_writelane_b32 v41, s49, 7
+; CHECK-NEXT:    v_writelane_b32 v41, s50, 8
+; CHECK-NEXT:    v_writelane_b32 v41, s51, 9
+; CHECK-NEXT:    v_writelane_b32 v41, s52, 10
 ; CHECK-NEXT:    s_addk_i32 s32, 0x400
-; CHECK-NEXT:    v_writelane_b32 v41, s53, 13
-; CHECK-NEXT:    v_writelane_b32 v41, s54, 14
+; CHECK-NEXT:    v_writelane_b32 v41, s53, 11
+; CHECK-NEXT:    v_writelane_b32 v41, s54, 12
 ; CHECK-NEXT:    s_mov_b64 s[48:49], s[4:5]
 ; CHECK-NEXT:    ;DEBUG_VALUE: dummy:dummy <- undef
 ; CHECK-NEXT:  .Ltmp0:
@@ -512,10 +510,12 @@ define weak_odr void @test(i32 %0) !dbg !34 {
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
 ; CHECK-NEXT:    s_add_u32 s4, s4, __kmpc_alloc_shared at gotpcrel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s5, s5, __kmpc_alloc_shared at gotpcrel32@hi+12
-; CHECK-NEXT:    v_writelane_b32 v41, s55, 15
+; CHECK-NEXT:    v_writelane_b32 v41, s55, 13
 ; CHECK-NEXT:    s_load_dwordx2 s[54:55], s[4:5], 0x0
+; CHECK-NEXT:    v_writelane_b32 v41, s30, 14
 ; CHECK-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; CHECK-NEXT:    v_writelane_b32 v41, s31, 15
 ; CHECK-NEXT:    v_mov_b32_e32 v40, v31
 ; CHECK-NEXT:    s_mov_b32 s50, s15
 ; CHECK-NEXT:    s_mov_b32 s51, s14
@@ -541,23 +541,23 @@ define weak_odr void @test(i32 %0) !dbg !34 {
 ; CHECK-NEXT:    .loc 1 0 9 is_stmt 0 ; dummy:0:9
 ; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 0
+; CHECK-NEXT:    v_readlane_b32 s30, v41, 14
 ; CHECK-NEXT:    flat_store_dword v[0:1], v2
-; CHECK-NEXT:    v_readlane_b32 s55, v41, 15
-; CHECK-NEXT:    v_readlane_b32 s54, v41, 14
-; CHECK-NEXT:    v_readlane_b32 s53, v41, 13
-; CHECK-NEXT:    v_readlane_b32 s52, v41, 12
-; CHECK-NEXT:    v_readlane_b32 s51, v41, 11
-; CHECK-NEXT:    v_readlane_b32 s50, v41, 10
-; CHECK-NEXT:    v_readlane_b32 s49, v41, 9
-; CHECK-NEXT:    v_readlane_b32 s48, v41, 8
-; CHECK-NEXT:    v_readlane_b32 s39, v41, 7
-; CHECK-NEXT:    v_readlane_b32 s38, v41, 6
-; CHECK-NEXT:    v_readlane_b32 s37, v41, 5
-; CHECK-NEXT:    v_readlane_b32 s36, v41, 4
-; CHECK-NEXT:    v_readlane_b32 s35, v41, 3
-; CHECK-NEXT:    v_readlane_b32 s34, v41, 2
-; CHECK-NEXT:    v_readlane_b32 s31, v41, 1
-; CHECK-NEXT:    v_readlane_b32 s30, v41, 0
+; CHECK-NEXT:    v_readlane_b32 s31, v41, 15
+; CHECK-NEXT:    v_readlane_b32 s55, v41, 13
+; CHECK-NEXT:    v_readlane_b32 s54, v41, 12
+; CHECK-NEXT:    v_readlane_b32 s53, v41, 11
+; CHECK-NEXT:    v_readlane_b32 s52, v41, 10
+; CHECK-NEXT:    v_readlane_b32 s51, v41, 9
+; CHECK-NEXT:    v_readlane_b32 s50, v41, 8
+; CHECK-NEXT:    v_readlane_b32 s49, v41, 7
+; CHECK-NEXT:    v_readlane_b32 s48, v41, 6
+; CHECK-NEXT:    v_readlane_b32 s39, v41, 5
+; CHECK-NEXT:    v_readlane_b32 s38, v41, 4
+; CHECK-NEXT:    v_readlane_b32 s37, v41, 3
+; CHECK-NEXT:    v_readlane_b32 s36, v41, 2
+; CHECK-NEXT:    v_readlane_b32 s35, v41, 1
+; CHECK-NEXT:    v_readlane_b32 s34, v41, 0
 ; CHECK-NEXT:    s_mov_b32 s32, s33
 ; CHECK-NEXT:    v_readlane_b32 s4, v41, 16
 ; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll b/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll
index bcccf50e3805c..db48100aa7caf 100644
--- a/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll
+++ b/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll
@@ -299,8 +299,8 @@ define amdgpu_gfx void @amdgpu_gfx() #0 {
 ; CHECK-TRUE16-NEXT:    s_wait_alu 0xfffe
 ; CHECK-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; CHECK-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; CHECK-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; CHECK-TRUE16-NEXT:    v_readlane_b32 s30, v40, 0
+; CHECK-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; CHECK-TRUE16-NEXT:    s_mov_b32 s32, s33
 ; CHECK-TRUE16-NEXT:    v_readlane_b32 s0, v40, 2
 ; CHECK-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -339,8 +339,8 @@ define amdgpu_gfx void @amdgpu_gfx() #0 {
 ; CHECK-FAKE16-NEXT:    s_wait_alu 0xfffe
 ; CHECK-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; CHECK-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; CHECK-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; CHECK-FAKE16-NEXT:    v_readlane_b32 s30, v40, 0
+; CHECK-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; CHECK-FAKE16-NEXT:    s_mov_b32 s32, s33
 ; CHECK-FAKE16-NEXT:    v_readlane_b32 s0, v40, 2
 ; CHECK-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
diff --git a/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll b/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll
index 76a2114a000cf..cba5aa8ef3672 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll
+++ b/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll
@@ -55,8 +55,8 @@ define void @test_stack_realign(<8 x i32> %val, i32 %idx) #0 {
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GCN-NEXT:    v_readlane_b32 s31, v42, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v42, 0
+; GCN-NEXT:    v_readlane_b32 s31, v42, 1
 ; GCN-NEXT:    s_mov_b32 s32, s34
 ; GCN-NEXT:    v_readlane_b32 s4, v42, 2
 ; GCN-NEXT:    v_readlane_b32 s34, v42, 3
diff --git a/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll
index 2e88da142bb41..6abe5998d6767 100644
--- a/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll
+++ b/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll
@@ -26,8 +26,8 @@ define void @callee_with_stack_and_call() #0 {
 ; SPILL-TO-VGPR-NEXT:    buffer_store_dword v0, off, s[0:3], s33
 ; SPILL-TO-VGPR-NEXT:    s_waitcnt vmcnt(0)
 ; SPILL-TO-VGPR-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; SPILL-TO-VGPR-NEXT:    v_readlane_b32 s31, v40, 1
 ; SPILL-TO-VGPR-NEXT:    v_readlane_b32 s30, v40, 0
+; SPILL-TO-VGPR-NEXT:    v_readlane_b32 s31, v40, 1
 ; SPILL-TO-VGPR-NEXT:    s_mov_b32 s32, s33
 ; SPILL-TO-VGPR-NEXT:    v_readlane_b32 s4, v40, 2
 ; SPILL-TO-VGPR-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -46,21 +46,14 @@ define void @callee_with_stack_and_call() #0 {
 ; NO-SPILL-TO-VGPR-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
 ; NO-SPILL-TO-VGPR-NEXT:    s_addk_i32 s32, 0x800
 ; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 s[4:5], exec
-; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 exec, 1
+; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 exec, 3
 ; NO-SPILL-TO-VGPR-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:16
 ; NO-SPILL-TO-VGPR-NEXT:    v_writelane_b32 v0, s30, 0
+; NO-SPILL-TO-VGPR-NEXT:    v_writelane_b32 v0, s31, 1
 ; NO-SPILL-TO-VGPR-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; NO-SPILL-TO-VGPR-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:16
 ; NO-SPILL-TO-VGPR-NEXT:    s_waitcnt vmcnt(0)
 ; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 exec, s[4:5]
-; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 s[4:5], exec
-; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 exec, 1
-; NO-SPILL-TO-VGPR-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:16
-; NO-SPILL-TO-VGPR-NEXT:    v_writelane_b32 v0, s31, 0
-; NO-SPILL-TO-VGPR-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; NO-SPILL-TO-VGPR-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:16
-; NO-SPILL-TO-VGPR-NEXT:    s_waitcnt vmcnt(0)
-; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 exec, s[4:5]
 ; NO-SPILL-TO-VGPR-NEXT:    v_mov_b32_e32 v0, 0
 ; NO-SPILL-TO-VGPR-NEXT:    s_getpc_b64 s[4:5]
 ; NO-SPILL-TO-VGPR-NEXT:    s_add_u32 s4, s4, external_void_func_void at rel32@lo+4
@@ -69,20 +62,12 @@ define void @callee_with_stack_and_call() #0 {
 ; NO-SPILL-TO-VGPR-NEXT:    s_waitcnt vmcnt(0)
 ; NO-SPILL-TO-VGPR-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 s[4:5], exec
-; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 exec, 1
-; NO-SPILL-TO-VGPR-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:16
-; NO-SPILL-TO-VGPR-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; NO-SPILL-TO-VGPR-NEXT:    s_waitcnt vmcnt(0)
-; NO-SPILL-TO-VGPR-NEXT:    v_readlane_b32 s31, v0, 0
-; NO-SPILL-TO-VGPR-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:16
-; NO-SPILL-TO-VGPR-NEXT:    s_waitcnt vmcnt(0)
-; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 exec, s[4:5]
-; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 s[4:5], exec
-; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 exec, 1
+; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 exec, 3
 ; NO-SPILL-TO-VGPR-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:16
 ; NO-SPILL-TO-VGPR-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; NO-SPILL-TO-VGPR-NEXT:    s_waitcnt vmcnt(0)
 ; NO-SPILL-TO-VGPR-NEXT:    v_readlane_b32 s30, v0, 0
+; NO-SPILL-TO-VGPR-NEXT:    v_readlane_b32 s31, v0, 1
 ; NO-SPILL-TO-VGPR-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:16
 ; NO-SPILL-TO-VGPR-NEXT:    s_waitcnt vmcnt(0)
 ; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 exec, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll b/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll
index 0f535c3a3bdbc..6dd683e6fca53 100644
--- a/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll
@@ -1740,8 +1740,8 @@ define void @caller_void_func_i32_v2float_inreg(i32 inreg %arg0, <2 x float> inr
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[20:21]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -1770,8 +1770,8 @@ define void @caller_void_func_i32_v2float_inreg(i32 inreg %arg0, <2 x float> inr
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -2159,8 +2159,8 @@ define void @void_func_a13i32_inreg([13  x i32] inreg %arg0, ptr addrspace(1) %p
 ; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -2201,8 +2201,8 @@ define void @void_func_a13i32_inreg([13  x i32] inreg %arg0, ptr addrspace(1) %p
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll b/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll
index 9d137fb4101e4..a2f203a111e18 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll
@@ -39,47 +39,47 @@ define amdgpu_gfx void @gfx_func() {
 ; SDAG-NEXT:    v_writelane_b32 v40, s27, 23
 ; SDAG-NEXT:    v_writelane_b32 v40, s28, 24
 ; SDAG-NEXT:    v_writelane_b32 v40, s29, 25
-; SDAG-NEXT:    v_writelane_b32 v40, s30, 26
-; SDAG-NEXT:    v_writelane_b32 v40, s31, 27
-; SDAG-NEXT:    v_writelane_b32 v40, s72, 28
-; SDAG-NEXT:    v_writelane_b32 v40, s73, 29
-; SDAG-NEXT:    v_writelane_b32 v40, s74, 30
-; SDAG-NEXT:    v_writelane_b32 v40, s75, 31
-; SDAG-NEXT:    v_writelane_b32 v40, s76, 32
-; SDAG-NEXT:    v_writelane_b32 v40, s77, 33
-; SDAG-NEXT:    v_writelane_b32 v40, s78, 34
-; SDAG-NEXT:    v_writelane_b32 v40, s79, 35
-; SDAG-NEXT:    v_writelane_b32 v40, s88, 36
-; SDAG-NEXT:    v_writelane_b32 v40, s89, 37
-; SDAG-NEXT:    v_writelane_b32 v40, s90, 38
-; SDAG-NEXT:    v_writelane_b32 v40, s91, 39
-; SDAG-NEXT:    v_writelane_b32 v40, s92, 40
-; SDAG-NEXT:    v_writelane_b32 v40, s93, 41
-; SDAG-NEXT:    v_writelane_b32 v40, s94, 42
+; SDAG-NEXT:    v_writelane_b32 v40, s72, 26
+; SDAG-NEXT:    v_writelane_b32 v40, s73, 27
+; SDAG-NEXT:    v_writelane_b32 v40, s74, 28
+; SDAG-NEXT:    v_writelane_b32 v40, s75, 29
+; SDAG-NEXT:    v_writelane_b32 v40, s76, 30
+; SDAG-NEXT:    v_writelane_b32 v40, s77, 31
+; SDAG-NEXT:    v_writelane_b32 v40, s78, 32
+; SDAG-NEXT:    v_writelane_b32 v40, s79, 33
+; SDAG-NEXT:    v_writelane_b32 v40, s88, 34
+; SDAG-NEXT:    v_writelane_b32 v40, s89, 35
+; SDAG-NEXT:    v_writelane_b32 v40, s90, 36
+; SDAG-NEXT:    v_writelane_b32 v40, s91, 37
+; SDAG-NEXT:    v_writelane_b32 v40, s92, 38
+; SDAG-NEXT:    v_writelane_b32 v40, s93, 39
+; SDAG-NEXT:    v_writelane_b32 v40, s94, 40
+; SDAG-NEXT:    v_writelane_b32 v40, s95, 41
+; SDAG-NEXT:    v_writelane_b32 v40, s30, 42
 ; SDAG-NEXT:    s_mov_b32 s35, extern_c_func at abs32@hi
 ; SDAG-NEXT:    s_mov_b32 s34, extern_c_func at abs32@lo
 ; SDAG-NEXT:    s_mov_b64 s[8:9], 0
 ; SDAG-NEXT:    s_addk_i32 s32, 0x400
-; SDAG-NEXT:    v_writelane_b32 v40, s95, 43
+; SDAG-NEXT:    v_writelane_b32 v40, s31, 43
 ; SDAG-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; SDAG-NEXT:    v_readlane_b32 s95, v40, 43
-; SDAG-NEXT:    v_readlane_b32 s94, v40, 42
-; SDAG-NEXT:    v_readlane_b32 s93, v40, 41
-; SDAG-NEXT:    v_readlane_b32 s92, v40, 40
-; SDAG-NEXT:    v_readlane_b32 s91, v40, 39
-; SDAG-NEXT:    v_readlane_b32 s90, v40, 38
-; SDAG-NEXT:    v_readlane_b32 s89, v40, 37
-; SDAG-NEXT:    v_readlane_b32 s88, v40, 36
-; SDAG-NEXT:    v_readlane_b32 s79, v40, 35
-; SDAG-NEXT:    v_readlane_b32 s78, v40, 34
-; SDAG-NEXT:    v_readlane_b32 s77, v40, 33
-; SDAG-NEXT:    v_readlane_b32 s76, v40, 32
-; SDAG-NEXT:    v_readlane_b32 s75, v40, 31
-; SDAG-NEXT:    v_readlane_b32 s74, v40, 30
-; SDAG-NEXT:    v_readlane_b32 s73, v40, 29
-; SDAG-NEXT:    v_readlane_b32 s72, v40, 28
-; SDAG-NEXT:    v_readlane_b32 s31, v40, 27
-; SDAG-NEXT:    v_readlane_b32 s30, v40, 26
+; SDAG-NEXT:    v_readlane_b32 s30, v40, 42
+; SDAG-NEXT:    v_readlane_b32 s31, v40, 43
+; SDAG-NEXT:    v_readlane_b32 s95, v40, 41
+; SDAG-NEXT:    v_readlane_b32 s94, v40, 40
+; SDAG-NEXT:    v_readlane_b32 s93, v40, 39
+; SDAG-NEXT:    v_readlane_b32 s92, v40, 38
+; SDAG-NEXT:    v_readlane_b32 s91, v40, 37
+; SDAG-NEXT:    v_readlane_b32 s90, v40, 36
+; SDAG-NEXT:    v_readlane_b32 s89, v40, 35
+; SDAG-NEXT:    v_readlane_b32 s88, v40, 34
+; SDAG-NEXT:    v_readlane_b32 s79, v40, 33
+; SDAG-NEXT:    v_readlane_b32 s78, v40, 32
+; SDAG-NEXT:    v_readlane_b32 s77, v40, 31
+; SDAG-NEXT:    v_readlane_b32 s76, v40, 30
+; SDAG-NEXT:    v_readlane_b32 s75, v40, 29
+; SDAG-NEXT:    v_readlane_b32 s74, v40, 28
+; SDAG-NEXT:    v_readlane_b32 s73, v40, 27
+; SDAG-NEXT:    v_readlane_b32 s72, v40, 26
 ; SDAG-NEXT:    v_readlane_b32 s29, v40, 25
 ; SDAG-NEXT:    v_readlane_b32 s28, v40, 24
 ; SDAG-NEXT:    v_readlane_b32 s27, v40, 23
@@ -148,47 +148,47 @@ define amdgpu_gfx void @gfx_func() {
 ; GISEL-NEXT:    v_writelane_b32 v40, s27, 23
 ; GISEL-NEXT:    v_writelane_b32 v40, s28, 24
 ; GISEL-NEXT:    v_writelane_b32 v40, s29, 25
-; GISEL-NEXT:    v_writelane_b32 v40, s30, 26
-; GISEL-NEXT:    v_writelane_b32 v40, s31, 27
-; GISEL-NEXT:    v_writelane_b32 v40, s72, 28
-; GISEL-NEXT:    v_writelane_b32 v40, s73, 29
-; GISEL-NEXT:    v_writelane_b32 v40, s74, 30
-; GISEL-NEXT:    v_writelane_b32 v40, s75, 31
-; GISEL-NEXT:    v_writelane_b32 v40, s76, 32
-; GISEL-NEXT:    v_writelane_b32 v40, s77, 33
-; GISEL-NEXT:    v_writelane_b32 v40, s78, 34
-; GISEL-NEXT:    v_writelane_b32 v40, s79, 35
-; GISEL-NEXT:    v_writelane_b32 v40, s88, 36
-; GISEL-NEXT:    v_writelane_b32 v40, s89, 37
-; GISEL-NEXT:    v_writelane_b32 v40, s90, 38
-; GISEL-NEXT:    v_writelane_b32 v40, s91, 39
-; GISEL-NEXT:    v_writelane_b32 v40, s92, 40
-; GISEL-NEXT:    v_writelane_b32 v40, s93, 41
-; GISEL-NEXT:    v_writelane_b32 v40, s94, 42
+; GISEL-NEXT:    v_writelane_b32 v40, s72, 26
+; GISEL-NEXT:    v_writelane_b32 v40, s73, 27
+; GISEL-NEXT:    v_writelane_b32 v40, s74, 28
+; GISEL-NEXT:    v_writelane_b32 v40, s75, 29
+; GISEL-NEXT:    v_writelane_b32 v40, s76, 30
+; GISEL-NEXT:    v_writelane_b32 v40, s77, 31
+; GISEL-NEXT:    v_writelane_b32 v40, s78, 32
+; GISEL-NEXT:    v_writelane_b32 v40, s79, 33
+; GISEL-NEXT:    v_writelane_b32 v40, s88, 34
+; GISEL-NEXT:    v_writelane_b32 v40, s89, 35
+; GISEL-NEXT:    v_writelane_b32 v40, s90, 36
+; GISEL-NEXT:    v_writelane_b32 v40, s91, 37
+; GISEL-NEXT:    v_writelane_b32 v40, s92, 38
+; GISEL-NEXT:    v_writelane_b32 v40, s93, 39
+; GISEL-NEXT:    v_writelane_b32 v40, s94, 40
+; GISEL-NEXT:    v_writelane_b32 v40, s95, 41
+; GISEL-NEXT:    v_writelane_b32 v40, s30, 42
 ; GISEL-NEXT:    s_mov_b32 s34, extern_c_func at abs32@lo
 ; GISEL-NEXT:    s_mov_b32 s35, extern_c_func at abs32@hi
 ; GISEL-NEXT:    s_mov_b64 s[8:9], 0
 ; GISEL-NEXT:    s_addk_i32 s32, 0x400
-; GISEL-NEXT:    v_writelane_b32 v40, s95, 43
+; GISEL-NEXT:    v_writelane_b32 v40, s31, 43
 ; GISEL-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GISEL-NEXT:    v_readlane_b32 s95, v40, 43
-; GISEL-NEXT:    v_readlane_b32 s94, v40, 42
-; GISEL-NEXT:    v_readlane_b32 s93, v40, 41
-; GISEL-NEXT:    v_readlane_b32 s92, v40, 40
-; GISEL-NEXT:    v_readlane_b32 s91, v40, 39
-; GISEL-NEXT:    v_readlane_b32 s90, v40, 38
-; GISEL-NEXT:    v_readlane_b32 s89, v40, 37
-; GISEL-NEXT:    v_readlane_b32 s88, v40, 36
-; GISEL-NEXT:    v_readlane_b32 s79, v40, 35
-; GISEL-NEXT:    v_readlane_b32 s78, v40, 34
-; GISEL-NEXT:    v_readlane_b32 s77, v40, 33
-; GISEL-NEXT:    v_readlane_b32 s76, v40, 32
-; GISEL-NEXT:    v_readlane_b32 s75, v40, 31
-; GISEL-NEXT:    v_readlane_b32 s74, v40, 30
-; GISEL-NEXT:    v_readlane_b32 s73, v40, 29
-; GISEL-NEXT:    v_readlane_b32 s72, v40, 28
-; GISEL-NEXT:    v_readlane_b32 s31, v40, 27
-; GISEL-NEXT:    v_readlane_b32 s30, v40, 26
+; GISEL-NEXT:    v_readlane_b32 s30, v40, 42
+; GISEL-NEXT:    v_readlane_b32 s31, v40, 43
+; GISEL-NEXT:    v_readlane_b32 s95, v40, 41
+; GISEL-NEXT:    v_readlane_b32 s94, v40, 40
+; GISEL-NEXT:    v_readlane_b32 s93, v40, 39
+; GISEL-NEXT:    v_readlane_b32 s92, v40, 38
+; GISEL-NEXT:    v_readlane_b32 s91, v40, 37
+; GISEL-NEXT:    v_readlane_b32 s90, v40, 36
+; GISEL-NEXT:    v_readlane_b32 s89, v40, 35
+; GISEL-NEXT:    v_readlane_b32 s88, v40, 34
+; GISEL-NEXT:    v_readlane_b32 s79, v40, 33
+; GISEL-NEXT:    v_readlane_b32 s78, v40, 32
+; GISEL-NEXT:    v_readlane_b32 s77, v40, 31
+; GISEL-NEXT:    v_readlane_b32 s76, v40, 30
+; GISEL-NEXT:    v_readlane_b32 s75, v40, 29
+; GISEL-NEXT:    v_readlane_b32 s74, v40, 28
+; GISEL-NEXT:    v_readlane_b32 s73, v40, 27
+; GISEL-NEXT:    v_readlane_b32 s72, v40, 26
 ; GISEL-NEXT:    v_readlane_b32 s29, v40, 25
 ; GISEL-NEXT:    v_readlane_b32 s28, v40, 24
 ; GISEL-NEXT:    v_readlane_b32 s27, v40, 23
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
index 234eaa8af7edf..39231558934a7 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
@@ -141,8 +141,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm() #0 {
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    buffer_store_byte v0, off, s[0:3], s32
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -170,8 +170,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm() #0 {
 ; GFX10-NEXT:    buffer_store_byte v0, off, s[0:3], s32
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -200,8 +200,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -229,8 +229,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_byte off, v0, s32
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -264,8 +264,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 {
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    buffer_store_byte v0, off, s[0:3], s32
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -295,8 +295,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 {
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    buffer_store_byte v0, off, s[0:3], s32
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -326,8 +326,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 {
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX11-NEXT:    scratch_store_b8 off, v0, s32
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -357,8 +357,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX10-SCRATCH-NEXT:    scratch_store_byte off, v0, s32
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -393,8 +393,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 {
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    buffer_store_byte v0, off, s[0:3], s32
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -424,8 +424,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 {
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    buffer_store_byte v0, off, s[0:3], s32
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -455,8 +455,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 {
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX11-NEXT:    scratch_store_b8 off, v0, s32
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -486,8 +486,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX10-SCRATCH-NEXT:    scratch_store_byte off, v0, s32
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -519,8 +519,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -547,8 +547,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -576,8 +576,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 {
 ; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -604,8 +604,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 {
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -632,8 +632,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -665,8 +665,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -694,8 +694,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -724,8 +724,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 {
 ; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -753,8 +753,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 {
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -782,8 +782,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -816,8 +816,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -845,8 +845,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -875,8 +875,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 {
 ; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -904,8 +904,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 {
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -933,8 +933,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -966,8 +966,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -994,8 +994,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -1023,8 +1023,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 {
 ; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -1051,8 +1051,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 {
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -1079,8 +1079,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -1112,8 +1112,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -1141,8 +1141,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -1171,8 +1171,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 {
 ; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -1200,8 +1200,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 {
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -1229,8 +1229,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -1263,8 +1263,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -1292,8 +1292,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -1322,8 +1322,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 {
 ; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -1351,8 +1351,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 {
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -1380,8 +1380,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -1413,8 +1413,8 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm(i32) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -1441,8 +1441,8 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm(i32) #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -1470,8 +1470,8 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm(i32) #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -1498,8 +1498,8 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -1531,8 +1531,8 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -1560,8 +1560,8 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm() #0 {
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -1589,8 +1589,8 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -1618,8 +1618,8 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -1652,8 +1652,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -1682,8 +1682,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -1713,8 +1713,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -1743,8 +1743,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -1779,8 +1779,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -1810,8 +1810,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm() #0 {
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -1840,8 +1840,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -1871,8 +1871,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -1907,8 +1907,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -1939,8 +1939,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 {
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -1970,8 +1970,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -2002,8 +2002,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 {
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -2043,8 +2043,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -2077,8 +2077,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 {
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -2109,8 +2109,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -2143,8 +2143,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 {
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -2177,8 +2177,8 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -2205,8 +2205,8 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -2234,8 +2234,8 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 {
 ; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -2262,8 +2262,8 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 {
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -2290,8 +2290,8 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -2322,8 +2322,8 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -2350,8 +2350,8 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -2379,8 +2379,8 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -2407,8 +2407,8 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -2440,8 +2440,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -2469,8 +2469,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm() #0 {
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -2498,8 +2498,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -2527,8 +2527,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -2561,8 +2561,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -2591,8 +2591,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm() #0 {
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -2621,8 +2621,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -2651,8 +2651,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -2687,8 +2687,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -2719,8 +2719,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 {
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -2750,8 +2750,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -2782,8 +2782,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -2815,8 +2815,8 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -2844,8 +2844,8 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm() #0 {
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -2873,8 +2873,8 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -2902,8 +2902,8 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -2937,8 +2937,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -2968,8 +2968,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm() #0 {
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -2998,8 +2998,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -3029,8 +3029,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -3066,8 +3066,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -3099,8 +3099,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 {
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -3130,8 +3130,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -3163,8 +3163,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -3200,8 +3200,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8() #0 {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -3233,8 +3233,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8() #0 {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -3267,8 +3267,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8() #0 {
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
 ; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -3300,8 +3300,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8() #0 {
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
 ; GFX11-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -3333,8 +3333,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8() #0 {
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -3371,8 +3371,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8() #0 {
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -3404,8 +3404,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8() #0 {
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -3437,8 +3437,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8() #0 {
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -3470,8 +3470,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8() #0 {
 ; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -3509,8 +3509,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8() #0 {
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -3543,8 +3543,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8() #0 {
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -3577,8 +3577,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8() #0 {
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -3611,8 +3611,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8() #0 {
 ; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -3652,8 +3652,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v5
 ; GFX9-NEXT:    v_mov_b32_e32 v4, v6
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -3688,8 +3688,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8() #0 {
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v5
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v6
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -3724,8 +3724,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8() #0 {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v5
 ; GFX11-NEXT:    v_mov_b32_e32 v4, v6
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -3760,8 +3760,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8() #0 {
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, v5
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v4, v6
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -3804,8 +3804,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v8
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -3843,8 +3843,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8() #0 {
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v8
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -3881,8 +3881,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8() #0 {
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
 ; GFX11-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v1, v8
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -3920,8 +3920,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8() #0 {
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, v8
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -3996,8 +3996,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v18, v33
 ; GFX9-NEXT:    v_mov_b32_e32 v19, v34
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -4068,8 +4068,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8() #0 {
 ; GFX10-NEXT:    v_mov_b32_e32 v18, v33
 ; GFX10-NEXT:    v_mov_b32_e32 v19, v34
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -4135,8 +4135,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8() #0 {
 ; GFX11-NEXT:    v_dual_mov_b32 v17, v32 :: v_dual_mov_b32 v18, v33
 ; GFX11-NEXT:    v_mov_b32_e32 v19, v34
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -4207,8 +4207,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8() #0 {
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v18, v33
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v19, v34
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -4249,8 +4249,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_ret() #0 {
 ; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v42, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -4285,8 +4285,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_ret() #0 {
 ; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4
-; GFX10-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v42, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -4322,8 +4322,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_ret() #0 {
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
 ; GFX11-TRUE16-NEXT:    scratch_load_b32 v41, off, s33
 ; GFX11-TRUE16-NEXT:    scratch_load_b32 v40, off, s33 offset:4
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s0, v42, 2
 ; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -4358,8 +4358,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_ret() #0 {
 ; GFX11-FAKE16-NEXT:    s_clause 0x1
 ; GFX11-FAKE16-NEXT:    scratch_load_b32 v41, off, s33
 ; GFX11-FAKE16-NEXT:    scratch_load_b32 v40, off, s33 offset:4
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v42, 2
 ; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -4394,8 +4394,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_ret() #0 {
 ; GFX10-SCRATCH-NEXT:    s_clause 0x1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 offset:4
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v42, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -4442,8 +4442,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 {
 ; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v42, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -4478,8 +4478,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 {
 ; GFX10-NEXT:    v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_lshlrev_b16 v1, 8, v1
-; GFX10-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v42, 2
 ; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -4522,8 +4522,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 {
 ; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v0.h, 8, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v0.h
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
@@ -4566,8 +4566,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 {
 ; GFX11-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v1, 8, v1
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v42, 2
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -4607,8 +4607,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 {
 ; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_lshlrev_b16 v1, 8, v1
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v42, 2
 ; GFX10-SCRATCH-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -4663,8 +4663,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 {
 ; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v42, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -4701,8 +4701,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 {
 ; GFX10-NEXT:    v_lshlrev_b16 v1, 8, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 2
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
-; GFX10-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v42, 2
@@ -4743,20 +4743,20 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 {
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v0.h, 8, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
 ; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v3, 2
 ; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v4, 0
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v42, 0
 ; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v0.h
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
 ; GFX11-TRUE16-NEXT:    global_store_b8 v[3:4], v2, off
 ; GFX11-TRUE16-NEXT:    global_store_b16 v[40:41], v0, off
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
 ; GFX11-TRUE16-NEXT:    scratch_load_b32 v41, off, s33
 ; GFX11-TRUE16-NEXT:    scratch_load_b32 v40, off, s33 offset:4
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s0, v42, 2
 ; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -4795,8 +4795,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 {
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v0
 ; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 2
 ; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v4, v3
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v42, 2
@@ -4840,8 +4840,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 {
 ; GFX10-SCRATCH-NEXT:    v_lshlrev_b16 v1, 8, v1
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v3, 2
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v4, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-SCRATCH-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v42, 2
@@ -4898,8 +4898,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 {
 ; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v42, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -4936,8 +4936,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 {
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_lshlrev_b16 v1, 8, v1
 ; GFX10-NEXT:    v_lshlrev_b16 v3, 8, v3
-; GFX10-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT:    v_or_b32_sdwa v1, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -4985,11 +4985,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 {
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v2.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v1.l, 8, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v1.h, v0.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v1.l
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v42, 1
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v42, 0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-TRUE16-NEXT:    global_store_b32 v[40:41], v0, off
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
@@ -5033,8 +5033,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 {
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v3, 8, v3
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
@@ -5082,8 +5082,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 {
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_lshlrev_b16 v1, 8, v1
 ; GFX10-SCRATCH-NEXT:    v_lshlrev_b16 v3, 8, v3
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-SCRATCH-NEXT:    v_or_b32_sdwa v1, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -5146,8 +5146,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 {
 ; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v42, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -5186,8 +5186,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 {
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_lshlrev_b16 v1, 8, v1
 ; GFX10-NEXT:    v_lshlrev_b16 v3, 8, v3
-; GFX10-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -5242,7 +5242,7 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 {
 ; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, 4
 ; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v1.h, v0.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v42, 0
 ; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v1.l
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
 ; GFX11-TRUE16-NEXT:    global_store_b8 v[2:3], v4, off
@@ -5251,7 +5251,6 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 {
 ; GFX11-TRUE16-NEXT:    scratch_load_b32 v41, off, s33
 ; GFX11-TRUE16-NEXT:    scratch_load_b32 v40, off, s33 offset:4
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v42, 1
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v42, 0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s0, v42, 2
 ; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -5293,8 +5292,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 {
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v3, 8, v3
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
@@ -5347,8 +5346,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 {
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_lshlrev_b16 v1, 8, v1
 ; GFX10-SCRATCH-NEXT:    v_lshlrev_b16 v3, 8, v3
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-SCRATCH-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -5419,8 +5418,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 {
 ; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v42, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -5464,12 +5463,12 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 {
 ; GFX10-NEXT:    v_lshlrev_b16 v7, 8, v7
 ; GFX10-NEXT:    v_lshlrev_b16 v1, 8, v1
 ; GFX10-NEXT:    v_lshlrev_b16 v3, 8, v3
-; GFX10-NEXT:    v_readlane_b32 s31, v42, 1
+; GFX10-NEXT:    v_readlane_b32 s30, v42, 0
 ; GFX10-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT:    v_or_b32_sdwa v5, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_or_b32_sdwa v1, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX10-NEXT:    v_readlane_b32 s34, v42, 2
@@ -5527,14 +5526,13 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 {
 ; GFX11-TRUE16-NEXT:    v_or_b16 v4.h, v1.h, v0.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v3.h, v2.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v2.l, v3.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v42, 0
 ; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v0.l, v1.l
 ; GFX11-TRUE16-NEXT:    global_store_b64 v[40:41], v[3:4], off
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
 ; GFX11-TRUE16-NEXT:    scratch_load_b32 v41, off, s33
 ; GFX11-TRUE16-NEXT:    scratch_load_b32 v40, off, s33 offset:4
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v42, 1
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v42, 0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s0, v42, 2
 ; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -5585,12 +5583,12 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 {
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v5
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v6, v7
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v42, 1
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v42, 0
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v4
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
@@ -5642,12 +5640,12 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 {
 ; GFX10-SCRATCH-NEXT:    v_lshlrev_b16 v7, 8, v7
 ; GFX10-SCRATCH-NEXT:    v_lshlrev_b16 v1, 8, v1
 ; GFX10-SCRATCH-NEXT:    v_lshlrev_b16 v3, 8, v3
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v42, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v42, 0
 ; GFX10-SCRATCH-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-SCRATCH-NEXT:    v_or_b32_sdwa v5, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-SCRATCH-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-SCRATCH-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_or_b32_sdwa v1, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v42, 2
@@ -5781,8 +5779,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 {
 ; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_readlane_b32 s31, v44, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v44, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v44, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v44, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -5904,8 +5902,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 {
 ; GFX10-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:8
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:12
-; GFX10-NEXT:    v_readlane_b32 s31, v44, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v44, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v44, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v44, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -6032,8 +6030,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 {
 ; GFX11-TRUE16-NEXT:    scratch_load_b32 v42, off, s33 offset:4
 ; GFX11-TRUE16-NEXT:    scratch_load_b32 v41, off, s33 offset:8
 ; GFX11-TRUE16-NEXT:    scratch_load_b32 v40, off, s33 offset:12
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v44, 1
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v44, 0
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v44, 1
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s0, v44, 2
 ; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -6183,8 +6181,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 {
 ; GFX11-FAKE16-NEXT:    scratch_load_b32 v42, off, s33 offset:4
 ; GFX11-FAKE16-NEXT:    scratch_load_b32 v41, off, s33 offset:8
 ; GFX11-FAKE16-NEXT:    scratch_load_b32 v40, off, s33 offset:12
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v44, 1
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v44, 0
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v44, 1
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v44, 2
 ; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -6306,8 +6304,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v42, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:8
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 offset:12
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v44, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v44, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v44, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v44, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -6342,8 +6340,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -6370,8 +6368,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -6399,8 +6397,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -6427,8 +6425,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -6460,8 +6458,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -6488,8 +6486,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -6517,8 +6515,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -6545,8 +6543,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -6578,8 +6576,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -6606,8 +6604,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -6635,8 +6633,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -6663,8 +6661,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -6697,8 +6695,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -6726,8 +6724,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm() #0 {
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -6755,8 +6753,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -6784,8 +6782,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -6817,8 +6815,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -6846,8 +6844,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm() #0 {
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -6876,8 +6874,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -6905,8 +6903,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -6937,8 +6935,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -6965,8 +6963,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -6994,8 +6992,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -7022,8 +7020,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -7056,8 +7054,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -7085,8 +7083,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm() #0 {
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -7115,8 +7113,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -7144,8 +7142,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -7176,8 +7174,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -7204,8 +7202,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -7233,8 +7231,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -7261,8 +7259,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -7294,8 +7292,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -7322,8 +7320,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -7351,8 +7349,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -7379,8 +7377,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -7413,8 +7411,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -7442,8 +7440,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm() #0 {
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -7471,8 +7469,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -7500,8 +7498,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -7534,8 +7532,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm(i32) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -7564,8 +7562,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm(i32) #0 {
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -7594,8 +7592,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm(i32) #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -7624,8 +7622,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -7659,8 +7657,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32(i32) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -7690,8 +7688,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32(i32) #0 {
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -7720,8 +7718,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32(i32) #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -7751,8 +7749,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -7783,8 +7781,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -7811,8 +7809,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -7840,8 +7838,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -7868,8 +7866,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -7904,8 +7902,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -7935,8 +7933,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm() #0 {
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -7965,8 +7963,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -7996,8 +7994,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -8032,8 +8030,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -8064,8 +8062,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 {
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -8095,8 +8093,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -8127,8 +8125,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -8163,8 +8161,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 {
 ; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v8i32 at abs32@lo
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -8196,8 +8194,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 {
 ; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v8i32 at abs32@lo
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -8230,8 +8228,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -8263,8 +8261,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v8i32 at abs32@lo
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -8304,8 +8302,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -8339,8 +8337,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 {
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -8371,8 +8369,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -8406,8 +8404,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -8444,8 +8442,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 {
 ; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v16i32 at abs32@lo
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -8479,8 +8477,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 {
 ; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v16i32 at abs32@lo
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -8515,8 +8513,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -8550,8 +8548,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v16i32 at abs32@lo
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -8595,8 +8593,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 {
 ; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v32i32 at abs32@lo
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -8634,8 +8632,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 {
 ; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v32i32 at abs32@lo
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -8674,8 +8672,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -8713,8 +8711,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v32i32 at abs32@lo
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -8761,8 +8759,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 {
 ; GFX9-NEXT:    s_waitcnt vmcnt(8)
 ; GFX9-NEXT:    buffer_store_dword v32, off, s[0:3], s32
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -8803,8 +8801,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 {
 ; GFX10-NEXT:    s_waitcnt vmcnt(8)
 ; GFX10-NEXT:    buffer_store_dword v33, off, s[0:3], s32
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -8845,8 +8843,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 {
 ; GFX11-NEXT:    s_waitcnt vmcnt(8)
 ; GFX11-NEXT:    scratch_store_b32 off, v32, s32
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -8887,8 +8885,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(8)
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v33, s32
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -8930,8 +8928,8 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v42, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -8967,8 +8965,8 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou
 ; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4
-; GFX10-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v42, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -9004,8 +9002,8 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    scratch_load_b32 v41, off, s33
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:4
-; GFX11-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v42, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -9041,8 +9039,8 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou
 ; GFX10-SCRATCH-NEXT:    s_clause 0x1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 offset:4
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v42, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -9078,8 +9076,8 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 {
 ; GFX9-NEXT:    s_mov_b32 s34, external_void_func_struct_i8_i32 at abs32@lo
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -9111,8 +9109,8 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 {
 ; GFX10-NEXT:    s_mov_b32 s34, external_void_func_struct_i8_i32 at abs32@lo
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -9145,8 +9143,8 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 {
 ; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -9178,8 +9176,8 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 {
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -9211,8 +9209,8 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_struct_i8_i32 at abs32@lo
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -9249,8 +9247,8 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -9281,8 +9279,8 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 {
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -9315,8 +9313,8 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 {
 ; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -9347,8 +9345,8 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 {
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -9379,8 +9377,8 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 {
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -9424,8 +9422,8 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT:    buffer_load_ubyte v0, off, s[0:3], s33 offset:8
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:12
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -9466,8 +9464,8 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
 ; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    buffer_load_ubyte v0, off, s[0:3], s33 offset:8
 ; GFX10-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:12
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
@@ -9508,8 +9506,8 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v0, off, s33 offset:8
 ; GFX11-TRUE16-NEXT:    scratch_load_b32 v1, off, s33 offset:12
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
@@ -9548,8 +9546,8 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
 ; GFX11-FAKE16-NEXT:    s_clause 0x1
 ; GFX11-FAKE16-NEXT:    scratch_load_u8 v0, off, s33 offset:8
 ; GFX11-FAKE16-NEXT:    scratch_load_b32 v1, off, s33 offset:12
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
@@ -9590,8 +9588,8 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
 ; GFX10-SCRATCH-NEXT:    s_clause 0x1
 ; GFX10-SCRATCH-NEXT:    scratch_load_ubyte v0, off, s33 offset:8
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v1, off, s33 offset:12
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
@@ -9662,8 +9660,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v2, v17
 ; GFX9-NEXT:    v_mov_b32_e32 v3, v18
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -9713,8 +9711,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 {
 ; GFX10-NEXT:    v_mov_b32_e32 v2, v17
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v18
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -9761,8 +9759,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 {
 ; GFX11-NEXT:    v_dual_mov_b32 v12, v3 :: v_dual_mov_b32 v1, v16
 ; GFX11-NEXT:    v_dual_mov_b32 v2, v17 :: v_dual_mov_b32 v3, v18
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -9812,8 +9810,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 {
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v2, v17
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v3, v18
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -9841,46 +9839,46 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
 ; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s33 offset:16
 ; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s33 offset:20
 ; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s33
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s35, 3
-; GFX9-NEXT:    v_writelane_b32 v40, s36, 4
-; GFX9-NEXT:    v_writelane_b32 v40, s37, 5
-; GFX9-NEXT:    v_writelane_b32 v40, s38, 6
-; GFX9-NEXT:    v_writelane_b32 v40, s39, 7
-; GFX9-NEXT:    v_writelane_b32 v40, s48, 8
-; GFX9-NEXT:    v_writelane_b32 v40, s49, 9
-; GFX9-NEXT:    v_writelane_b32 v40, s50, 10
-; GFX9-NEXT:    v_writelane_b32 v40, s51, 11
-; GFX9-NEXT:    v_writelane_b32 v40, s52, 12
-; GFX9-NEXT:    v_writelane_b32 v40, s53, 13
+; GFX9-NEXT:    v_writelane_b32 v40, s34, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s35, 1
+; GFX9-NEXT:    v_writelane_b32 v40, s36, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s37, 3
+; GFX9-NEXT:    v_writelane_b32 v40, s38, 4
+; GFX9-NEXT:    v_writelane_b32 v40, s39, 5
+; GFX9-NEXT:    v_writelane_b32 v40, s48, 6
+; GFX9-NEXT:    v_writelane_b32 v40, s49, 7
+; GFX9-NEXT:    v_writelane_b32 v40, s50, 8
+; GFX9-NEXT:    v_writelane_b32 v40, s51, 9
+; GFX9-NEXT:    v_writelane_b32 v40, s52, 10
+; GFX9-NEXT:    v_writelane_b32 v40, s53, 11
+; GFX9-NEXT:    v_writelane_b32 v40, s54, 12
+; GFX9-NEXT:    v_writelane_b32 v40, s55, 13
 ; GFX9-NEXT:    s_addk_i32 s32, 0x800
-; GFX9-NEXT:    v_writelane_b32 v40, s54, 14
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 14
 ; GFX9-NEXT:    s_mov_b32 s5, byval_align16_f64_arg at abs32@hi
 ; GFX9-NEXT:    s_mov_b32 s4, byval_align16_f64_arg at abs32@lo
-; GFX9-NEXT:    v_writelane_b32 v40, s55, 15
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 15
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NEXT:    buffer_store_dword v32, off, s[0:3], s32
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:4
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s55, v40, 15
-; GFX9-NEXT:    v_readlane_b32 s54, v40, 14
-; GFX9-NEXT:    v_readlane_b32 s53, v40, 13
-; GFX9-NEXT:    v_readlane_b32 s52, v40, 12
-; GFX9-NEXT:    v_readlane_b32 s51, v40, 11
-; GFX9-NEXT:    v_readlane_b32 s50, v40, 10
-; GFX9-NEXT:    v_readlane_b32 s49, v40, 9
-; GFX9-NEXT:    v_readlane_b32 s48, v40, 8
-; GFX9-NEXT:    v_readlane_b32 s39, v40, 7
-; GFX9-NEXT:    v_readlane_b32 s38, v40, 6
-; GFX9-NEXT:    v_readlane_b32 s37, v40, 5
-; GFX9-NEXT:    v_readlane_b32 s36, v40, 4
-; GFX9-NEXT:    v_readlane_b32 s35, v40, 3
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 14
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 15
+; GFX9-NEXT:    v_readlane_b32 s55, v40, 13
+; GFX9-NEXT:    v_readlane_b32 s54, v40, 12
+; GFX9-NEXT:    v_readlane_b32 s53, v40, 11
+; GFX9-NEXT:    v_readlane_b32 s52, v40, 10
+; GFX9-NEXT:    v_readlane_b32 s51, v40, 9
+; GFX9-NEXT:    v_readlane_b32 s50, v40, 8
+; GFX9-NEXT:    v_readlane_b32 s49, v40, 7
+; GFX9-NEXT:    v_readlane_b32 s48, v40, 6
+; GFX9-NEXT:    v_readlane_b32 s39, v40, 5
+; GFX9-NEXT:    v_readlane_b32 s38, v40, 4
+; GFX9-NEXT:    v_readlane_b32 s37, v40, 3
+; GFX9-NEXT:    v_readlane_b32 s36, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s35, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s34, v40, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
@@ -9902,7 +9900,7 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
 ; GFX10-NEXT:    buffer_load_dword v32, off, s[0:3], s33 offset:16
 ; GFX10-NEXT:    buffer_load_dword v33, off, s[0:3], s33 offset:20
 ; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s33
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s34, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x400
 ; GFX10-NEXT:    s_mov_b32 s5, byval_align16_f64_arg at abs32@hi
 ; GFX10-NEXT:    s_mov_b32 s4, byval_align16_f64_arg at abs32@lo
@@ -9910,38 +9908,38 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
 ; GFX10-NEXT:    buffer_store_dword v32, off, s[0:3], s32
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:4
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
-; GFX10-NEXT:    v_writelane_b32 v40, s35, 3
-; GFX10-NEXT:    v_writelane_b32 v40, s36, 4
-; GFX10-NEXT:    v_writelane_b32 v40, s37, 5
-; GFX10-NEXT:    v_writelane_b32 v40, s38, 6
-; GFX10-NEXT:    v_writelane_b32 v40, s39, 7
-; GFX10-NEXT:    v_writelane_b32 v40, s48, 8
-; GFX10-NEXT:    v_writelane_b32 v40, s49, 9
-; GFX10-NEXT:    v_writelane_b32 v40, s50, 10
-; GFX10-NEXT:    v_writelane_b32 v40, s51, 11
-; GFX10-NEXT:    v_writelane_b32 v40, s52, 12
-; GFX10-NEXT:    v_writelane_b32 v40, s53, 13
-; GFX10-NEXT:    v_writelane_b32 v40, s54, 14
-; GFX10-NEXT:    v_writelane_b32 v40, s55, 15
+; GFX10-NEXT:    v_writelane_b32 v40, s35, 1
+; GFX10-NEXT:    v_writelane_b32 v40, s36, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s37, 3
+; GFX10-NEXT:    v_writelane_b32 v40, s38, 4
+; GFX10-NEXT:    v_writelane_b32 v40, s39, 5
+; GFX10-NEXT:    v_writelane_b32 v40, s48, 6
+; GFX10-NEXT:    v_writelane_b32 v40, s49, 7
+; GFX10-NEXT:    v_writelane_b32 v40, s50, 8
+; GFX10-NEXT:    v_writelane_b32 v40, s51, 9
+; GFX10-NEXT:    v_writelane_b32 v40, s52, 10
+; GFX10-NEXT:    v_writelane_b32 v40, s53, 11
+; GFX10-NEXT:    v_writelane_b32 v40, s54, 12
+; GFX10-NEXT:    v_writelane_b32 v40, s55, 13
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 14
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 15
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_readlane_b32 s55, v40, 15
-; GFX10-NEXT:    v_readlane_b32 s54, v40, 14
-; GFX10-NEXT:    v_readlane_b32 s53, v40, 13
-; GFX10-NEXT:    v_readlane_b32 s52, v40, 12
-; GFX10-NEXT:    v_readlane_b32 s51, v40, 11
-; GFX10-NEXT:    v_readlane_b32 s50, v40, 10
-; GFX10-NEXT:    v_readlane_b32 s49, v40, 9
-; GFX10-NEXT:    v_readlane_b32 s48, v40, 8
-; GFX10-NEXT:    v_readlane_b32 s39, v40, 7
-; GFX10-NEXT:    v_readlane_b32 s38, v40, 6
-; GFX10-NEXT:    v_readlane_b32 s37, v40, 5
-; GFX10-NEXT:    v_readlane_b32 s36, v40, 4
-; GFX10-NEXT:    v_readlane_b32 s35, v40, 3
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 14
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 15
+; GFX10-NEXT:    v_readlane_b32 s55, v40, 13
+; GFX10-NEXT:    v_readlane_b32 s54, v40, 12
+; GFX10-NEXT:    v_readlane_b32 s53, v40, 11
+; GFX10-NEXT:    v_readlane_b32 s52, v40, 10
+; GFX10-NEXT:    v_readlane_b32 s51, v40, 9
+; GFX10-NEXT:    v_readlane_b32 s50, v40, 8
+; GFX10-NEXT:    v_readlane_b32 s49, v40, 7
+; GFX10-NEXT:    v_readlane_b32 s48, v40, 6
+; GFX10-NEXT:    v_readlane_b32 s39, v40, 5
+; GFX10-NEXT:    v_readlane_b32 s38, v40, 4
+; GFX10-NEXT:    v_readlane_b32 s37, v40, 3
+; GFX10-NEXT:    v_readlane_b32 s36, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s35, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s34, v40, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
@@ -9962,44 +9960,44 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    scratch_load_b64 v[32:33], off, s33 offset:16
 ; GFX11-NEXT:    scratch_load_b32 v31, off, s33
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v40, s34, 0
 ; GFX11-NEXT:    s_add_i32 s32, s32, 32
 ; GFX11-NEXT:    s_mov_b32 s1, byval_align16_f64_arg at abs32@hi
 ; GFX11-NEXT:    s_mov_b32 s0, byval_align16_f64_arg at abs32@lo
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX11-NEXT:    v_writelane_b32 v40, s34, 2
-; GFX11-NEXT:    v_writelane_b32 v40, s35, 3
-; GFX11-NEXT:    v_writelane_b32 v40, s36, 4
-; GFX11-NEXT:    v_writelane_b32 v40, s37, 5
-; GFX11-NEXT:    v_writelane_b32 v40, s38, 6
-; GFX11-NEXT:    v_writelane_b32 v40, s39, 7
-; GFX11-NEXT:    v_writelane_b32 v40, s48, 8
-; GFX11-NEXT:    v_writelane_b32 v40, s49, 9
-; GFX11-NEXT:    v_writelane_b32 v40, s50, 10
-; GFX11-NEXT:    v_writelane_b32 v40, s51, 11
-; GFX11-NEXT:    v_writelane_b32 v40, s52, 12
-; GFX11-NEXT:    v_writelane_b32 v40, s53, 13
-; GFX11-NEXT:    v_writelane_b32 v40, s54, 14
-; GFX11-NEXT:    v_writelane_b32 v40, s55, 15
+; GFX11-NEXT:    v_writelane_b32 v40, s35, 1
+; GFX11-NEXT:    v_writelane_b32 v40, s36, 2
+; GFX11-NEXT:    v_writelane_b32 v40, s37, 3
+; GFX11-NEXT:    v_writelane_b32 v40, s38, 4
+; GFX11-NEXT:    v_writelane_b32 v40, s39, 5
+; GFX11-NEXT:    v_writelane_b32 v40, s48, 6
+; GFX11-NEXT:    v_writelane_b32 v40, s49, 7
+; GFX11-NEXT:    v_writelane_b32 v40, s50, 8
+; GFX11-NEXT:    v_writelane_b32 v40, s51, 9
+; GFX11-NEXT:    v_writelane_b32 v40, s52, 10
+; GFX11-NEXT:    v_writelane_b32 v40, s53, 11
+; GFX11-NEXT:    v_writelane_b32 v40, s54, 12
+; GFX11-NEXT:    v_writelane_b32 v40, s55, 13
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 14
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 15
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
 ; GFX11-NEXT:    scratch_store_b64 off, v[32:33], s32
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    v_readlane_b32 s55, v40, 15
-; GFX11-NEXT:    v_readlane_b32 s54, v40, 14
-; GFX11-NEXT:    v_readlane_b32 s53, v40, 13
-; GFX11-NEXT:    v_readlane_b32 s52, v40, 12
-; GFX11-NEXT:    v_readlane_b32 s51, v40, 11
-; GFX11-NEXT:    v_readlane_b32 s50, v40, 10
-; GFX11-NEXT:    v_readlane_b32 s49, v40, 9
-; GFX11-NEXT:    v_readlane_b32 s48, v40, 8
-; GFX11-NEXT:    v_readlane_b32 s39, v40, 7
-; GFX11-NEXT:    v_readlane_b32 s38, v40, 6
-; GFX11-NEXT:    v_readlane_b32 s37, v40, 5
-; GFX11-NEXT:    v_readlane_b32 s36, v40, 4
-; GFX11-NEXT:    v_readlane_b32 s35, v40, 3
-; GFX11-NEXT:    v_readlane_b32 s34, v40, 2
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 14
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 15
+; GFX11-NEXT:    v_readlane_b32 s55, v40, 13
+; GFX11-NEXT:    v_readlane_b32 s54, v40, 12
+; GFX11-NEXT:    v_readlane_b32 s53, v40, 11
+; GFX11-NEXT:    v_readlane_b32 s52, v40, 10
+; GFX11-NEXT:    v_readlane_b32 s51, v40, 9
+; GFX11-NEXT:    v_readlane_b32 s50, v40, 8
+; GFX11-NEXT:    v_readlane_b32 s49, v40, 7
+; GFX11-NEXT:    v_readlane_b32 s48, v40, 6
+; GFX11-NEXT:    v_readlane_b32 s39, v40, 5
+; GFX11-NEXT:    v_readlane_b32 s38, v40, 4
+; GFX11-NEXT:    v_readlane_b32 s37, v40, 3
+; GFX11-NEXT:    v_readlane_b32 s36, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s35, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s34, v40, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:24 ; 4-byte Folded Reload
@@ -10020,44 +10018,44 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
 ; GFX10-SCRATCH-NEXT:    s_clause 0x1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dwordx2 v[32:33], off, s33 offset:16
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v31, off, s33
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s34, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 32
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, byval_align16_f64_arg at abs32@hi
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, byval_align16_f64_arg at abs32@lo
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s34, 2
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s35, 3
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s36, 4
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s37, 5
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s38, 6
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s39, 7
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s48, 8
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s49, 9
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s50, 10
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s51, 11
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s52, 12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s53, 13
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s54, 14
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s55, 15
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s35, 1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s36, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s37, 3
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s38, 4
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s39, 5
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s48, 6
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s49, 7
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s50, 8
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s51, 9
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s52, 10
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s53, 11
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s54, 12
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s55, 13
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 14
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 15
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-SCRATCH-NEXT:    scratch_store_dwordx2 off, v[32:33], s32
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s55, v40, 15
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s54, v40, 14
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s53, v40, 13
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s52, v40, 12
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s51, v40, 11
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s50, v40, 10
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s49, v40, 9
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s48, v40, 8
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s39, v40, 7
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s38, v40, 6
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s37, v40, 5
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s36, v40, 4
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s35, v40, 3
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s34, v40, 2
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 14
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 15
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s55, v40, 13
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s54, v40, 12
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s53, v40, 11
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s52, v40, 10
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s51, v40, 9
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s50, v40, 8
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s49, v40, 7
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s48, v40, 6
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s39, v40, 5
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s38, v40, 4
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s37, v40, 3
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s36, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s35, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s34, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 offset:24 ; 4-byte Folded Reload
@@ -10091,8 +10089,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm_inreg() #0 {
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    buffer_store_byte v0, off, s[0:3], s32
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -10120,8 +10118,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm_inreg() #0 {
 ; GFX10-NEXT:    buffer_store_byte v0, off, s[0:3], s32
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -10150,8 +10148,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -10179,8 +10177,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_byte off, v0, s32
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -10212,8 +10210,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 3
@@ -10242,8 +10240,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 3
@@ -10273,8 +10271,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 3
@@ -10303,8 +10301,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 3
@@ -10337,8 +10335,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 3
@@ -10367,8 +10365,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 3
@@ -10398,8 +10396,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 3
@@ -10428,8 +10426,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 3
@@ -10462,8 +10460,8 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 3
@@ -10492,8 +10490,8 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 3
@@ -10523,8 +10521,8 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 3
@@ -10553,8 +10551,8 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 3
@@ -10589,8 +10587,8 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
@@ -10622,8 +10620,8 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
@@ -10656,8 +10654,8 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
@@ -10689,8 +10687,8 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
@@ -10728,8 +10726,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -10765,8 +10763,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 4
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
@@ -10803,8 +10801,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX11-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
@@ -10840,8 +10838,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 4
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
@@ -10884,8 +10882,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -10923,8 +10921,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 4
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
@@ -10963,8 +10961,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX11-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
@@ -11002,8 +11000,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 4
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
@@ -11047,8 +11045,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 7
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 7
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 6
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 7
 ; GFX9-NEXT:    v_readlane_b32 s9, v40, 5
 ; GFX9-NEXT:    v_readlane_b32 s8, v40, 4
 ; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
@@ -11090,8 +11088,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 6
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 7
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 7
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 6
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 7
 ; GFX10-NEXT:    v_readlane_b32 s9, v40, 5
 ; GFX10-NEXT:    v_readlane_b32 s8, v40, 4
 ; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
@@ -11134,8 +11132,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 7
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 7
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 6
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 7
 ; GFX11-NEXT:    v_readlane_b32 s9, v40, 5
 ; GFX11-NEXT:    v_readlane_b32 s8, v40, 4
 ; GFX11-NEXT:    v_readlane_b32 s7, v40, 3
@@ -11177,8 +11175,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 6
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 7
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 7
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 6
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 7
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s9, v40, 5
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s8, v40, 4
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
@@ -11231,8 +11229,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 9
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 8
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX9-NEXT:    v_readlane_b32 s11, v40, 7
 ; GFX9-NEXT:    v_readlane_b32 s10, v40, 6
 ; GFX9-NEXT:    v_readlane_b32 s9, v40, 5
@@ -11280,8 +11278,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 8
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 9
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 8
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX10-NEXT:    v_readlane_b32 s11, v40, 7
 ; GFX10-NEXT:    v_readlane_b32 s10, v40, 6
 ; GFX10-NEXT:    v_readlane_b32 s9, v40, 5
@@ -11330,8 +11328,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 9
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 8
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX11-NEXT:    v_readlane_b32 s11, v40, 7
 ; GFX11-NEXT:    v_readlane_b32 s10, v40, 6
 ; GFX11-NEXT:    v_readlane_b32 s9, v40, 5
@@ -11379,8 +11377,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 8
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 9
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 8
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s11, v40, 7
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s10, v40, 6
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s9, v40, 5
@@ -11422,8 +11420,8 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 3
@@ -11452,8 +11450,8 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 3
@@ -11483,8 +11481,8 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 3
@@ -11513,8 +11511,8 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 3
@@ -11547,8 +11545,8 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 3
@@ -11577,8 +11575,8 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 3
@@ -11608,8 +11606,8 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 3
@@ -11638,8 +11636,8 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 3
@@ -11674,8 +11672,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
@@ -11707,8 +11705,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
@@ -11741,8 +11739,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
@@ -11774,8 +11772,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
@@ -11813,8 +11811,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 4
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 4
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 3
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 4
 ; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
@@ -11849,8 +11847,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 3
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 4
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 4
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 3
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 4
 ; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
@@ -11886,8 +11884,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 4
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 4
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 3
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 4
 ; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
@@ -11922,8 +11920,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 3
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 4
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 4
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 3
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 4
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
@@ -11966,8 +11964,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 6
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 6
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 5
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 6
 ; GFX9-NEXT:    v_readlane_b32 s8, v40, 4
 ; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
@@ -12008,8 +12006,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 5
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 6
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 6
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 5
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 6
 ; GFX10-NEXT:    v_readlane_b32 s8, v40, 4
 ; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
@@ -12051,8 +12049,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 6
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 6
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 5
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 6
 ; GFX11-NEXT:    v_readlane_b32 s8, v40, 4
 ; GFX11-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
@@ -12093,8 +12091,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 5
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 6
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 6
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 5
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 6
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s8, v40, 4
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
@@ -12133,8 +12131,8 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
@@ -12166,8 +12164,8 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
@@ -12200,8 +12198,8 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
@@ -12233,8 +12231,8 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
@@ -12274,8 +12272,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -12313,8 +12311,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 4
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
@@ -12353,8 +12351,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX11-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
@@ -12392,8 +12390,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 4
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
@@ -12439,8 +12437,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 7
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 7
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 6
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 7
 ; GFX9-NEXT:    v_readlane_b32 s9, v40, 5
 ; GFX9-NEXT:    v_readlane_b32 s8, v40, 4
 ; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
@@ -12484,8 +12482,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 6
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 7
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 7
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 6
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 7
 ; GFX10-NEXT:    v_readlane_b32 s9, v40, 5
 ; GFX10-NEXT:    v_readlane_b32 s8, v40, 4
 ; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
@@ -12530,8 +12528,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 7
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 7
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 6
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 7
 ; GFX11-NEXT:    v_readlane_b32 s9, v40, 5
 ; GFX11-NEXT:    v_readlane_b32 s8, v40, 4
 ; GFX11-NEXT:    v_readlane_b32 s7, v40, 3
@@ -12575,8 +12573,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 6
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 7
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 7
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 6
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 7
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s9, v40, 5
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s8, v40, 4
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
@@ -12614,8 +12612,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 3
@@ -12644,8 +12642,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 3
@@ -12675,8 +12673,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 3
@@ -12705,8 +12703,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 3
@@ -12741,8 +12739,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
@@ -12773,8 +12771,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
@@ -12806,8 +12804,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
@@ -12838,8 +12836,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
@@ -12875,8 +12873,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
@@ -12907,8 +12905,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
@@ -12940,8 +12938,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
@@ -12972,8 +12970,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
@@ -13010,8 +13008,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
@@ -13043,8 +13041,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
@@ -13077,8 +13075,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
@@ -13110,8 +13108,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
@@ -13147,8 +13145,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
@@ -13180,8 +13178,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
@@ -13214,8 +13212,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
@@ -13247,8 +13245,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
@@ -13283,8 +13281,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
@@ -13315,8 +13313,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
@@ -13348,8 +13346,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
@@ -13380,8 +13378,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
@@ -13418,8 +13416,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
@@ -13451,8 +13449,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
@@ -13485,8 +13483,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
@@ -13518,8 +13516,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
@@ -13553,8 +13551,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 3
@@ -13583,8 +13581,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 3
@@ -13614,8 +13612,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 3
@@ -13644,8 +13642,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 3
@@ -13680,8 +13678,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
@@ -13712,8 +13710,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
@@ -13745,8 +13743,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
@@ -13777,8 +13775,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
@@ -13815,8 +13813,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
@@ -13848,8 +13846,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
@@ -13882,8 +13880,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
@@ -13915,8 +13913,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
@@ -13954,8 +13952,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 4
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 4
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 3
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 4
 ; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
@@ -13990,8 +13988,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 3
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 4
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 4
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 3
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 4
 ; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
@@ -14027,8 +14025,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 4
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 4
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 3
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 4
 ; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
@@ -14063,8 +14061,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 3
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 4
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 4
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 3
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 4
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
@@ -14105,8 +14103,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -14144,8 +14142,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 4
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
@@ -14184,8 +14182,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX11-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
@@ -14223,8 +14221,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 4
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
@@ -14263,8 +14261,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -14299,8 +14297,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 4
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
@@ -14336,8 +14334,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX11-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
@@ -14372,8 +14370,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 4
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
@@ -14416,8 +14414,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -14455,8 +14453,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 4
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
@@ -14495,8 +14493,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX11-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
@@ -14534,8 +14532,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 4
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
@@ -14579,8 +14577,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 6
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 6
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 5
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 6
 ; GFX9-NEXT:    v_readlane_b32 s8, v40, 4
 ; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
@@ -14621,8 +14619,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 5
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 6
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 6
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 5
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 6
 ; GFX10-NEXT:    v_readlane_b32 s8, v40, 4
 ; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
@@ -14664,8 +14662,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 6
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 6
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 5
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 6
 ; GFX11-NEXT:    v_readlane_b32 s8, v40, 4
 ; GFX11-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
@@ -14706,8 +14704,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 5
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 6
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 6
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 5
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 6
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s8, v40, 4
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
@@ -14753,8 +14751,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 9
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 8
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX9-NEXT:    v_readlane_b32 s11, v40, 7
 ; GFX9-NEXT:    v_readlane_b32 s10, v40, 6
 ; GFX9-NEXT:    v_readlane_b32 s9, v40, 5
@@ -14799,8 +14797,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 8
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 9
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 8
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX10-NEXT:    v_readlane_b32 s11, v40, 7
 ; GFX10-NEXT:    v_readlane_b32 s10, v40, 6
 ; GFX10-NEXT:    v_readlane_b32 s9, v40, 5
@@ -14846,8 +14844,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 9
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 8
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX11-NEXT:    v_readlane_b32 s11, v40, 7
 ; GFX11-NEXT:    v_readlane_b32 s10, v40, 6
 ; GFX11-NEXT:    v_readlane_b32 s9, v40, 5
@@ -14892,8 +14890,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 8
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 9
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 8
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s11, v40, 7
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s10, v40, 6
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s9, v40, 5
@@ -14949,8 +14947,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 9
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 8
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX9-NEXT:    v_readlane_b32 s11, v40, 7
 ; GFX9-NEXT:    v_readlane_b32 s10, v40, 6
 ; GFX9-NEXT:    v_readlane_b32 s9, v40, 5
@@ -15000,8 +14998,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 8
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 9
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 8
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX10-NEXT:    v_readlane_b32 s11, v40, 7
 ; GFX10-NEXT:    v_readlane_b32 s10, v40, 6
 ; GFX10-NEXT:    v_readlane_b32 s9, v40, 5
@@ -15052,8 +15050,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 9
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 8
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX11-NEXT:    v_readlane_b32 s11, v40, 7
 ; GFX11-NEXT:    v_readlane_b32 s10, v40, 6
 ; GFX11-NEXT:    v_readlane_b32 s9, v40, 5
@@ -15103,8 +15101,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 8
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 9
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 8
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s11, v40, 7
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s10, v40, 6
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s9, v40, 5
@@ -15161,8 +15159,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 17
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 17
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 16
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 17
 ; GFX9-NEXT:    v_readlane_b32 s19, v40, 15
 ; GFX9-NEXT:    v_readlane_b32 s18, v40, 14
 ; GFX9-NEXT:    v_readlane_b32 s17, v40, 13
@@ -15223,8 +15221,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 16
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 17
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 17
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 16
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 17
 ; GFX10-NEXT:    v_readlane_b32 s19, v40, 15
 ; GFX10-NEXT:    v_readlane_b32 s18, v40, 14
 ; GFX10-NEXT:    v_readlane_b32 s17, v40, 13
@@ -15286,8 +15284,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 17
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 17
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 16
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 17
 ; GFX11-NEXT:    v_readlane_b32 s19, v40, 15
 ; GFX11-NEXT:    v_readlane_b32 s18, v40, 14
 ; GFX11-NEXT:    v_readlane_b32 s17, v40, 13
@@ -15348,8 +15346,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 17
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 17
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 16
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 17
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s19, v40, 15
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s18, v40, 14
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s17, v40, 13
@@ -15450,8 +15448,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 27
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:20
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 27
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 26
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 27
 ; GFX9-NEXT:    v_readlane_b32 s29, v40, 25
 ; GFX9-NEXT:    v_readlane_b32 s28, v40, 24
 ; GFX9-NEXT:    v_readlane_b32 s27, v40, 23
@@ -15557,8 +15555,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 26
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 27
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 27
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 26
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 27
 ; GFX10-NEXT:    v_readlane_b32 s29, v40, 25
 ; GFX10-NEXT:    v_readlane_b32 s28, v40, 24
 ; GFX10-NEXT:    v_readlane_b32 s27, v40, 23
@@ -15660,8 +15658,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 27
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 27
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 26
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 27
 ; GFX11-NEXT:    v_readlane_b32 s29, v40, 25
 ; GFX11-NEXT:    v_readlane_b32 s28, v40, 24
 ; GFX11-NEXT:    v_readlane_b32 s27, v40, 23
@@ -15764,8 +15762,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 26
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 27
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 27
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 26
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 27
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s29, v40, 25
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s28, v40, 24
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s27, v40, 23
@@ -15881,8 +15879,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 27
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:20
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 27
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 26
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 27
 ; GFX9-NEXT:    v_readlane_b32 s29, v40, 25
 ; GFX9-NEXT:    v_readlane_b32 s28, v40, 24
 ; GFX9-NEXT:    v_readlane_b32 s27, v40, 23
@@ -15993,8 +15991,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 26
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 27
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 27
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 26
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 27
 ; GFX10-NEXT:    v_readlane_b32 s29, v40, 25
 ; GFX10-NEXT:    v_readlane_b32 s28, v40, 24
 ; GFX10-NEXT:    v_readlane_b32 s27, v40, 23
@@ -16100,8 +16098,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 27
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 27
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 26
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 27
 ; GFX11-NEXT:    v_readlane_b32 s29, v40, 25
 ; GFX11-NEXT:    v_readlane_b32 s28, v40, 24
 ; GFX11-NEXT:    v_readlane_b32 s27, v40, 23
@@ -16210,8 +16208,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 26
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 27
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 27
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 26
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 27
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s29, v40, 25
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s28, v40, 24
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s27, v40, 23
@@ -16276,8 +16274,8 @@ define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, d
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:4
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -16310,8 +16308,8 @@ define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, d
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -16340,8 +16338,8 @@ define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, d
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    scratch_store_b64 off, v[32:33], s32
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -16370,8 +16368,8 @@ define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, d
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT:    scratch_store_dwordx2 off, v[32:33], s32
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -16442,8 +16440,8 @@ define amdgpu_gfx void @stack_12xv3i32() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v31, 11
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -16509,8 +16507,8 @@ define amdgpu_gfx void @stack_12xv3i32() #0 {
 ; GFX10-NEXT:    s_mov_b32 s34, external_void_func_12xv3i32 at abs32@lo
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -16556,8 +16554,8 @@ define amdgpu_gfx void @stack_12xv3i32() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -16620,8 +16618,8 @@ define amdgpu_gfx void @stack_12xv3i32() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_12xv3i32 at abs32@lo
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -16712,8 +16710,8 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v31, 7
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -16787,8 +16785,8 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
 ; GFX10-NEXT:    s_mov_b32 s34, external_void_func_8xv5i32 at abs32@lo
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -16838,8 +16836,8 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -16908,8 +16906,8 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_8xv5i32 at abs32@lo
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -16996,8 +16994,8 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v31, 0x40e00000
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -17071,8 +17069,8 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
 ; GFX10-NEXT:    s_mov_b32 s34, external_void_func_8xv5f32 at abs32@lo
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -17127,8 +17125,8 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -17197,8 +17195,8 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_8xv5f32 at abs32@lo
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -17237,8 +17235,8 @@ define amdgpu_gfx void @test_call_external_void_func_bf16(i16 %arg) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -17264,8 +17262,8 @@ define amdgpu_gfx void @test_call_external_void_func_bf16(i16 %arg) #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -17292,8 +17290,8 @@ define amdgpu_gfx void @test_call_external_void_func_bf16(i16 %arg) #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -17319,8 +17317,8 @@ define amdgpu_gfx void @test_call_external_void_func_bf16(i16 %arg) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -17351,8 +17349,8 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16(i16 %arg) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -17378,8 +17376,8 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16(i16 %arg) #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -17406,8 +17404,8 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16(i16 %arg) #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -17433,8 +17431,8 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16(i16 %arg) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -17465,8 +17463,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16(i32 %arg) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -17492,8 +17490,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16(i32 %arg) #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -17520,8 +17518,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16(i32 %arg) #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -17547,8 +17545,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16(i32 %arg) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -17579,8 +17577,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16(<3 x i16> %arg) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -17606,8 +17604,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16(<3 x i16> %arg) #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -17634,8 +17632,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16(<3 x i16> %arg) #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -17661,8 +17659,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16(<3 x i16> %arg) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -17693,8 +17691,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16(<4 x i16> %arg) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -17720,8 +17718,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16(<4 x i16> %arg) #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -17748,8 +17746,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16(<4 x i16> %arg) #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -17775,8 +17773,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16(<4 x i16> %arg) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -17807,8 +17805,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16(<8 x i16> %arg) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -17834,8 +17832,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16(<8 x i16> %arg) #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -17862,8 +17860,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16(<8 x i16> %arg) #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -17889,8 +17887,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16(<8 x i16> %arg) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -17921,8 +17919,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16(<16 x i16> %arg) #0
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -17948,8 +17946,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16(<16 x i16> %arg) #0
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -17976,8 +17974,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16(<16 x i16> %arg) #0
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -18003,8 +18001,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16(<16 x i16> %arg) #0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -18035,8 +18033,8 @@ define amdgpu_gfx void @test_call_external_void_func_bf16_inreg(i16 inreg %arg)
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -18062,8 +18060,8 @@ define amdgpu_gfx void @test_call_external_void_func_bf16_inreg(i16 inreg %arg)
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -18090,8 +18088,8 @@ define amdgpu_gfx void @test_call_external_void_func_bf16_inreg(i16 inreg %arg)
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -18117,8 +18115,8 @@ define amdgpu_gfx void @test_call_external_void_func_bf16_inreg(i16 inreg %arg)
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -18149,8 +18147,8 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16_inreg(i16 inreg %arg
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -18176,8 +18174,8 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16_inreg(i16 inreg %arg
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -18204,8 +18202,8 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16_inreg(i16 inreg %arg
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -18231,8 +18229,8 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16_inreg(i16 inreg %arg
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -18263,8 +18261,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16_inreg(i32 inreg %arg
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -18290,8 +18288,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16_inreg(i32 inreg %arg
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -18318,8 +18316,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16_inreg(i32 inreg %arg
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -18345,8 +18343,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16_inreg(i32 inreg %arg
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -18377,8 +18375,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16_inreg(<3 x i16> inre
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -18404,8 +18402,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16_inreg(<3 x i16> inre
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -18432,8 +18430,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16_inreg(<3 x i16> inre
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -18459,8 +18457,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16_inreg(<3 x i16> inre
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -18491,8 +18489,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16_inreg(<4 x i16> inre
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -18518,8 +18516,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16_inreg(<4 x i16> inre
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -18546,8 +18544,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16_inreg(<4 x i16> inre
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -18573,8 +18571,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16_inreg(<4 x i16> inre
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -18605,8 +18603,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16_inreg(<8 x i16> inre
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -18632,8 +18630,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16_inreg(<8 x i16> inre
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -18660,8 +18658,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16_inreg(<8 x i16> inre
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -18687,8 +18685,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16_inreg(<8 x i16> inre
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -18719,8 +18717,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16_inreg(<16 x i16> in
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -18746,8 +18744,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16_inreg(<16 x i16> in
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -18774,8 +18772,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16_inreg(<16 x i16> in
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -18801,8 +18799,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16_inreg(<16 x i16> in
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll
index 124de7e00f020..ecf2b33841076 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll
@@ -26,8 +26,8 @@ define amdgpu_gfx void @test_call_external_void_func_void_clobber_s30_s31_call_e
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
@@ -60,8 +60,8 @@ define amdgpu_gfx void @test_call_external_void_func_void_clobber_s30_s31_call_e
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
@@ -95,8 +95,8 @@ define amdgpu_gfx void @test_call_external_void_func_void_clobber_s30_s31_call_e
 ; GFX11-NEXT:    ;;#ASMEND
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
@@ -130,8 +130,8 @@ define amdgpu_gfx void @void_func_void_clobber_s28_s29() #1 {
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; clobber
 ; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    v_readlane_b32 s31, v0, 3
 ; GFX9-NEXT:    v_readlane_b32 s30, v0, 2
+; GFX9-NEXT:    v_readlane_b32 s31, v0, 3
 ; GFX9-NEXT:    v_readlane_b32 s29, v0, 1
 ; GFX9-NEXT:    v_readlane_b32 s28, v0, 0
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[34:35], -1
@@ -157,8 +157,8 @@ define amdgpu_gfx void @void_func_void_clobber_s28_s29() #1 {
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ; clobber
 ; GFX10-NEXT:    ;;#ASMEND
-; GFX10-NEXT:    v_readlane_b32 s31, v0, 3
 ; GFX10-NEXT:    v_readlane_b32 s30, v0, 2
+; GFX10-NEXT:    v_readlane_b32 s31, v0, 3
 ; GFX10-NEXT:    v_readlane_b32 s29, v0, 1
 ; GFX10-NEXT:    v_readlane_b32 s28, v0, 0
 ; GFX10-NEXT:    s_xor_saveexec_b32 s34, -1
@@ -185,8 +185,8 @@ define amdgpu_gfx void @void_func_void_clobber_s28_s29() #1 {
 ; GFX11-NEXT:    ; clobber
 ; GFX11-NEXT:    ;;#ASMEND
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v0, 3
 ; GFX11-NEXT:    v_readlane_b32 s30, v0, 2
+; GFX11-NEXT:    v_readlane_b32 s31, v0, 3
 ; GFX11-NEXT:    v_readlane_b32 s29, v0, 1
 ; GFX11-NEXT:    v_readlane_b32 s28, v0, 0
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
@@ -224,8 +224,8 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_s31(ptr addrspace(1)
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; use s31
 ; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 3
@@ -261,8 +261,8 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_s31(ptr addrspace(1)
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ; use s31
 ; GFX10-NEXT:    ;;#ASMEND
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 3
@@ -298,8 +298,8 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_s31(ptr addrspace(1)
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ; use s31
 ; GFX11-NEXT:    ;;#ASMEND
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 3
@@ -341,8 +341,8 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(ptr addrspace(1)
 ; GFX9-NEXT:    ; use v31
 ; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_readlane_b32 s31, v41, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v41, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v41, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v41, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -378,8 +378,8 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(ptr addrspace(1)
 ; GFX10-NEXT:    ; use v31
 ; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX10-NEXT:    v_readlane_b32 s31, v41, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v41, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v41, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v41, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -416,8 +416,8 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(ptr addrspace(1)
 ; GFX11-NEXT:    ; use v31
 ; GFX11-NEXT:    ;;#ASMEND
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
-; GFX11-NEXT:    v_readlane_b32 s31, v41, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v41, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v41, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v41, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -455,11 +455,11 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s33(ptr addrspace(1)
 ; GFX9-NEXT:    s_mov_b32 s4, s33
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT:    s_mov_b32 s33, s4
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; use s33
 ; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
-; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 3
@@ -492,11 +492,11 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s33(ptr addrspace(1)
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    s_mov_b32 s33, s4
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ; use s33
 ; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
-; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 3
@@ -529,12 +529,12 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s33(ptr addrspace(1)
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_mov_b32 s33, s4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ; use s33
 ; GFX11-NEXT:    ;;#ASMEND
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 3
@@ -572,11 +572,11 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s34(ptr addrspace(1)
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT:    s_mov_b32 s34, s4
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; use s34
 ; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
-; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 3
@@ -609,11 +609,11 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s34(ptr addrspace(1)
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    s_mov_b32 s34, s4
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ; use s34
 ; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
-; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 3
@@ -645,13 +645,13 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s34(ptr addrspace(1)
 ; GFX11-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s34, s4
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ; use s34
 ; GFX11-NEXT:    ;;#ASMEND
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 3
@@ -691,8 +691,8 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_v40(ptr addrspace(1)
 ; GFX9-NEXT:    ; use v40
 ; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_readlane_b32 s31, v41, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v41, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v41, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v41, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -726,8 +726,8 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_v40(ptr addrspace(1)
 ; GFX10-NEXT:    ; use v40
 ; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX10-NEXT:    v_readlane_b32 s31, v41, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v41, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v41, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v41, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -761,8 +761,8 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_v40(ptr addrspace(1)
 ; GFX11-NEXT:    ; use v40
 ; GFX11-NEXT:    ;;#ASMEND
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
-; GFX11-NEXT:    v_readlane_b32 s31, v41, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v41, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v41, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v41, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -849,8 +849,8 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s33() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -876,8 +876,8 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s33() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -904,8 +904,8 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s33() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -934,8 +934,8 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s34() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -961,8 +961,8 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s34() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -989,8 +989,8 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s34() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -1024,11 +1024,11 @@ define amdgpu_gfx void @callee_saved_sgpr_kernel() #1 {
 ; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    s_mov_b32 s4, s40
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; use s4
 ; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
-; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 3
@@ -1060,11 +1060,11 @@ define amdgpu_gfx void @callee_saved_sgpr_kernel() #1 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ; use s4
 ; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
-; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 3
@@ -1096,12 +1096,12 @@ define amdgpu_gfx void @callee_saved_sgpr_kernel() #1 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ; use s4
 ; GFX11-NEXT:    ;;#ASMEND
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 3
@@ -1150,8 +1150,8 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 {
 ; GFX9-NEXT:    ; use v40
 ; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_readlane_b32 s31, v41, 2
 ; GFX9-NEXT:    v_readlane_b32 s30, v41, 1
+; GFX9-NEXT:    v_readlane_b32 s31, v41, 2
 ; GFX9-NEXT:    v_readlane_b32 s4, v41, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v41, 3
@@ -1195,8 +1195,8 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 {
 ; GFX10-NEXT:    ; use v40
 ; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX10-NEXT:    v_readlane_b32 s31, v41, 2
 ; GFX10-NEXT:    v_readlane_b32 s30, v41, 1
+; GFX10-NEXT:    v_readlane_b32 s31, v41, 2
 ; GFX10-NEXT:    v_readlane_b32 s4, v41, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v41, 3
@@ -1240,8 +1240,8 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 {
 ; GFX11-NEXT:    ; use v40
 ; GFX11-NEXT:    ;;#ASMEND
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
-; GFX11-NEXT:    v_readlane_b32 s31, v41, 2
 ; GFX11-NEXT:    v_readlane_b32 s30, v41, 1
+; GFX11-NEXT:    v_readlane_b32 s31, v41, 2
 ; GFX11-NEXT:    v_readlane_b32 s4, v41, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v41, 3
diff --git a/llvm/test/CodeGen/AMDGPU/global-alias.ll b/llvm/test/CodeGen/AMDGPU/global-alias.ll
index d8df20eb69452..4c7bef4aec091 100644
--- a/llvm/test/CodeGen/AMDGPU/global-alias.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-alias.ll
@@ -35,8 +35,8 @@ define void @bar() {
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
 ; CHECK-NEXT:    v_readlane_b32 s30, v40, 0
+; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
 ; CHECK-NEXT:    s_mov_b32 s32, s33
 ; CHECK-NEXT:    v_readlane_b32 s4, v40, 2
 ; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
index 76f204dd0c16a..ae13753004c3d 100644
--- a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
+++ b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
@@ -9,28 +9,30 @@ define void @main(i1 %arg) #0 {
 ; CHECK-NEXT:    buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill
 ; CHECK-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; CHECK-NEXT:    s_mov_b64 exec, s[4:5]
-; CHECK-NEXT:    v_writelane_b32 v6, s30, 0
-; CHECK-NEXT:    v_writelane_b32 v6, s31, 1
-; CHECK-NEXT:    v_writelane_b32 v6, s36, 2
-; CHECK-NEXT:    v_writelane_b32 v6, s37, 3
-; CHECK-NEXT:    v_writelane_b32 v6, s38, 4
-; CHECK-NEXT:    v_writelane_b32 v6, s39, 5
-; CHECK-NEXT:    v_writelane_b32 v6, s48, 6
-; CHECK-NEXT:    v_writelane_b32 v6, s49, 7
-; CHECK-NEXT:    v_writelane_b32 v6, s50, 8
-; CHECK-NEXT:    v_writelane_b32 v6, s51, 9
-; CHECK-NEXT:    v_writelane_b32 v6, s52, 10
-; CHECK-NEXT:    v_writelane_b32 v6, s53, 11
-; CHECK-NEXT:    v_writelane_b32 v6, s54, 12
-; CHECK-NEXT:    v_writelane_b32 v6, s55, 13
-; CHECK-NEXT:    v_writelane_b32 v6, s64, 14
-; CHECK-NEXT:    v_writelane_b32 v6, s65, 15
-; CHECK-NEXT:    v_writelane_b32 v6, s66, 16
-; CHECK-NEXT:    v_writelane_b32 v6, s67, 17
-; CHECK-NEXT:    v_writelane_b32 v6, s68, 18
+; CHECK-NEXT:    v_writelane_b32 v6, s36, 0
+; CHECK-NEXT:    v_writelane_b32 v6, s37, 1
+; CHECK-NEXT:    v_writelane_b32 v6, s38, 2
+; CHECK-NEXT:    v_writelane_b32 v6, s39, 3
+; CHECK-NEXT:    v_writelane_b32 v6, s48, 4
+; CHECK-NEXT:    v_writelane_b32 v6, s49, 5
+; CHECK-NEXT:    v_writelane_b32 v6, s50, 6
+; CHECK-NEXT:    v_writelane_b32 v6, s51, 7
+; CHECK-NEXT:    v_writelane_b32 v6, s52, 8
+; CHECK-NEXT:    v_writelane_b32 v6, s53, 9
+; CHECK-NEXT:    v_writelane_b32 v6, s54, 10
+; CHECK-NEXT:    v_writelane_b32 v6, s55, 11
+; CHECK-NEXT:    v_writelane_b32 v6, s64, 12
+; CHECK-NEXT:    v_writelane_b32 v6, s65, 13
+; CHECK-NEXT:    v_writelane_b32 v6, s66, 14
+; CHECK-NEXT:    v_writelane_b32 v6, s67, 15
+; CHECK-NEXT:    v_writelane_b32 v6, s68, 16
+; CHECK-NEXT:    v_writelane_b32 v6, s69, 17
+; CHECK-NEXT:    v_writelane_b32 v6, s70, 18
+; CHECK-NEXT:    v_writelane_b32 v6, s71, 19
+; CHECK-NEXT:    v_writelane_b32 v6, s30, 20
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
 ; CHECK-NEXT:    s_mov_b64 s[8:9], 0
-; CHECK-NEXT:    v_writelane_b32 v6, s69, 19
+; CHECK-NEXT:    v_writelane_b32 v6, s31, 21
 ; CHECK-NEXT:    s_mov_b32 s68, 0
 ; CHECK-NEXT:    s_mov_b32 s69, s4
 ; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[8:9], 0x0
@@ -40,11 +42,11 @@ define void @main(i1 %arg) #0 {
 ; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    s_load_dwordx16 s[8:23], s[68:69], 0x130
 ; CHECK-NEXT:    ; implicit-def: $vgpr7 : SGPR spill to VGPR lane
-; CHECK-NEXT:    v_writelane_b32 v6, s70, 20
-; CHECK-NEXT:    v_writelane_b32 v6, s71, 21
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v1, s4
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 0
+; CHECK-NEXT:    s_load_dwordx16 s[36:51], s[68:69], 0x2f0
+; CHECK-NEXT:    s_mov_b32 s70, s68
 ; CHECK-NEXT:    v_writelane_b32 v7, s8, 0
 ; CHECK-NEXT:    v_writelane_b32 v7, s9, 1
 ; CHECK-NEXT:    v_writelane_b32 v7, s10, 2
@@ -77,9 +79,7 @@ define void @main(i1 %arg) #0 {
 ; CHECK-NEXT:    v_writelane_b32 v7, s65, 29
 ; CHECK-NEXT:    v_writelane_b32 v7, s66, 30
 ; CHECK-NEXT:    s_load_dwordx16 s[8:23], s[68:69], 0x1f0
-; CHECK-NEXT:    s_load_dwordx16 s[36:51], s[68:69], 0x2f0
 ; CHECK-NEXT:    s_mov_b32 s69, s68
-; CHECK-NEXT:    s_mov_b32 s70, s68
 ; CHECK-NEXT:    s_mov_b32 s71, s68
 ; CHECK-NEXT:    v_writelane_b32 v7, s67, 31
 ; CHECK-NEXT:    image_sample_lz v1, v[1:2], s[60:67], s[68:71] dmask:0x1
@@ -225,29 +225,29 @@ define void @main(i1 %arg) #0 {
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; CHECK-NEXT:  .LBB0_10: ; %UnifiedReturnBlock
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[6:7]
-; CHECK-NEXT:    v_readlane_b32 s71, v6, 21
-; CHECK-NEXT:    v_readlane_b32 s70, v6, 20
-; CHECK-NEXT:    v_readlane_b32 s69, v6, 19
-; CHECK-NEXT:    v_readlane_b32 s68, v6, 18
-; CHECK-NEXT:    v_readlane_b32 s67, v6, 17
-; CHECK-NEXT:    v_readlane_b32 s66, v6, 16
-; CHECK-NEXT:    v_readlane_b32 s65, v6, 15
-; CHECK-NEXT:    v_readlane_b32 s64, v6, 14
-; CHECK-NEXT:    v_readlane_b32 s55, v6, 13
-; CHECK-NEXT:    v_readlane_b32 s54, v6, 12
-; CHECK-NEXT:    v_readlane_b32 s53, v6, 11
-; CHECK-NEXT:    v_readlane_b32 s52, v6, 10
+; CHECK-NEXT:    v_readlane_b32 s30, v6, 20
+; CHECK-NEXT:    v_readlane_b32 s31, v6, 21
+; CHECK-NEXT:    v_readlane_b32 s71, v6, 19
+; CHECK-NEXT:    v_readlane_b32 s70, v6, 18
+; CHECK-NEXT:    v_readlane_b32 s69, v6, 17
+; CHECK-NEXT:    v_readlane_b32 s68, v6, 16
+; CHECK-NEXT:    v_readlane_b32 s67, v6, 15
+; CHECK-NEXT:    v_readlane_b32 s66, v6, 14
+; CHECK-NEXT:    v_readlane_b32 s65, v6, 13
+; CHECK-NEXT:    v_readlane_b32 s64, v6, 12
+; CHECK-NEXT:    v_readlane_b32 s55, v6, 11
+; CHECK-NEXT:    v_readlane_b32 s54, v6, 10
+; CHECK-NEXT:    v_readlane_b32 s53, v6, 9
+; CHECK-NEXT:    v_readlane_b32 s52, v6, 8
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_readlane_b32 s51, v6, 9
-; CHECK-NEXT:    v_readlane_b32 s50, v6, 8
-; CHECK-NEXT:    v_readlane_b32 s49, v6, 7
-; CHECK-NEXT:    v_readlane_b32 s48, v6, 6
-; CHECK-NEXT:    v_readlane_b32 s39, v6, 5
-; CHECK-NEXT:    v_readlane_b32 s38, v6, 4
-; CHECK-NEXT:    v_readlane_b32 s37, v6, 3
-; CHECK-NEXT:    v_readlane_b32 s36, v6, 2
-; CHECK-NEXT:    v_readlane_b32 s31, v6, 1
-; CHECK-NEXT:    v_readlane_b32 s30, v6, 0
+; CHECK-NEXT:    v_readlane_b32 s51, v6, 7
+; CHECK-NEXT:    v_readlane_b32 s50, v6, 6
+; CHECK-NEXT:    v_readlane_b32 s49, v6, 5
+; CHECK-NEXT:    v_readlane_b32 s48, v6, 4
+; CHECK-NEXT:    v_readlane_b32 s39, v6, 3
+; CHECK-NEXT:    v_readlane_b32 s38, v6, 2
+; CHECK-NEXT:    v_readlane_b32 s37, v6, 1
+; CHECK-NEXT:    v_readlane_b32 s36, v6, 0
 ; CHECK-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; CHECK-NEXT:    buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload
 ; CHECK-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
index a208cfdb197af..2aaaff1ecc407 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
@@ -128,24 +128,24 @@ define void @test_indirect_call_vgpr_ptr(ptr %fptr) {
 ; GCN-NEXT:    s_mov_b64 exec, s[18:19]
 ; GCN-NEXT:    v_writelane_b32 v40, s16, 18
 ; GCN-NEXT:    s_addk_i32 s32, 0x400
-; GCN-NEXT:    v_writelane_b32 v40, s30, 0
-; GCN-NEXT:    v_writelane_b32 v40, s31, 1
-; GCN-NEXT:    v_writelane_b32 v40, s34, 2
-; GCN-NEXT:    v_writelane_b32 v40, s35, 3
-; GCN-NEXT:    v_writelane_b32 v40, s36, 4
-; GCN-NEXT:    v_writelane_b32 v40, s37, 5
-; GCN-NEXT:    v_writelane_b32 v40, s38, 6
-; GCN-NEXT:    v_writelane_b32 v40, s39, 7
-; GCN-NEXT:    v_writelane_b32 v40, s48, 8
-; GCN-NEXT:    v_writelane_b32 v40, s49, 9
-; GCN-NEXT:    v_writelane_b32 v40, s50, 10
-; GCN-NEXT:    v_writelane_b32 v40, s51, 11
-; GCN-NEXT:    v_writelane_b32 v40, s52, 12
-; GCN-NEXT:    v_writelane_b32 v40, s53, 13
-; GCN-NEXT:    v_writelane_b32 v40, s54, 14
-; GCN-NEXT:    v_writelane_b32 v40, s55, 15
-; GCN-NEXT:    v_writelane_b32 v40, s64, 16
-; GCN-NEXT:    v_writelane_b32 v40, s65, 17
+; GCN-NEXT:    v_writelane_b32 v40, s34, 0
+; GCN-NEXT:    v_writelane_b32 v40, s35, 1
+; GCN-NEXT:    v_writelane_b32 v40, s36, 2
+; GCN-NEXT:    v_writelane_b32 v40, s37, 3
+; GCN-NEXT:    v_writelane_b32 v40, s38, 4
+; GCN-NEXT:    v_writelane_b32 v40, s39, 5
+; GCN-NEXT:    v_writelane_b32 v40, s48, 6
+; GCN-NEXT:    v_writelane_b32 v40, s49, 7
+; GCN-NEXT:    v_writelane_b32 v40, s50, 8
+; GCN-NEXT:    v_writelane_b32 v40, s51, 9
+; GCN-NEXT:    v_writelane_b32 v40, s52, 10
+; GCN-NEXT:    v_writelane_b32 v40, s53, 11
+; GCN-NEXT:    v_writelane_b32 v40, s54, 12
+; GCN-NEXT:    v_writelane_b32 v40, s55, 13
+; GCN-NEXT:    v_writelane_b32 v40, s64, 14
+; GCN-NEXT:    v_writelane_b32 v40, s65, 15
+; GCN-NEXT:    v_writelane_b32 v40, s30, 16
+; GCN-NEXT:    v_writelane_b32 v40, s31, 17
 ; GCN-NEXT:    s_mov_b32 s50, s15
 ; GCN-NEXT:    s_mov_b32 s51, s14
 ; GCN-NEXT:    s_mov_b32 s52, s13
@@ -175,24 +175,24 @@ define void @test_indirect_call_vgpr_ptr(ptr %fptr) {
 ; GCN-NEXT:    s_cbranch_execnz .LBB2_1
 ; GCN-NEXT:  ; %bb.2:
 ; GCN-NEXT:    s_mov_b64 exec, s[54:55]
-; GCN-NEXT:    v_readlane_b32 s65, v40, 17
-; GCN-NEXT:    v_readlane_b32 s64, v40, 16
-; GCN-NEXT:    v_readlane_b32 s55, v40, 15
-; GCN-NEXT:    v_readlane_b32 s54, v40, 14
-; GCN-NEXT:    v_readlane_b32 s53, v40, 13
-; GCN-NEXT:    v_readlane_b32 s52, v40, 12
-; GCN-NEXT:    v_readlane_b32 s51, v40, 11
-; GCN-NEXT:    v_readlane_b32 s50, v40, 10
-; GCN-NEXT:    v_readlane_b32 s49, v40, 9
-; GCN-NEXT:    v_readlane_b32 s48, v40, 8
-; GCN-NEXT:    v_readlane_b32 s39, v40, 7
-; GCN-NEXT:    v_readlane_b32 s38, v40, 6
-; GCN-NEXT:    v_readlane_b32 s37, v40, 5
-; GCN-NEXT:    v_readlane_b32 s36, v40, 4
-; GCN-NEXT:    v_readlane_b32 s35, v40, 3
-; GCN-NEXT:    v_readlane_b32 s34, v40, 2
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
-; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s30, v40, 16
+; GCN-NEXT:    v_readlane_b32 s31, v40, 17
+; GCN-NEXT:    v_readlane_b32 s65, v40, 15
+; GCN-NEXT:    v_readlane_b32 s64, v40, 14
+; GCN-NEXT:    v_readlane_b32 s55, v40, 13
+; GCN-NEXT:    v_readlane_b32 s54, v40, 12
+; GCN-NEXT:    v_readlane_b32 s53, v40, 11
+; GCN-NEXT:    v_readlane_b32 s52, v40, 10
+; GCN-NEXT:    v_readlane_b32 s51, v40, 9
+; GCN-NEXT:    v_readlane_b32 s50, v40, 8
+; GCN-NEXT:    v_readlane_b32 s49, v40, 7
+; GCN-NEXT:    v_readlane_b32 s48, v40, 6
+; GCN-NEXT:    v_readlane_b32 s39, v40, 5
+; GCN-NEXT:    v_readlane_b32 s38, v40, 4
+; GCN-NEXT:    v_readlane_b32 s37, v40, 3
+; GCN-NEXT:    v_readlane_b32 s36, v40, 2
+; GCN-NEXT:    v_readlane_b32 s35, v40, 1
+; GCN-NEXT:    v_readlane_b32 s34, v40, 0
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 18
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -212,24 +212,24 @@ define void @test_indirect_call_vgpr_ptr(ptr %fptr) {
 ; GISEL-NEXT:    s_mov_b64 exec, s[18:19]
 ; GISEL-NEXT:    v_writelane_b32 v40, s16, 18
 ; GISEL-NEXT:    s_addk_i32 s32, 0x400
-; GISEL-NEXT:    v_writelane_b32 v40, s30, 0
-; GISEL-NEXT:    v_writelane_b32 v40, s31, 1
-; GISEL-NEXT:    v_writelane_b32 v40, s34, 2
-; GISEL-NEXT:    v_writelane_b32 v40, s35, 3
-; GISEL-NEXT:    v_writelane_b32 v40, s36, 4
-; GISEL-NEXT:    v_writelane_b32 v40, s37, 5
-; GISEL-NEXT:    v_writelane_b32 v40, s38, 6
-; GISEL-NEXT:    v_writelane_b32 v40, s39, 7
-; GISEL-NEXT:    v_writelane_b32 v40, s48, 8
-; GISEL-NEXT:    v_writelane_b32 v40, s49, 9
-; GISEL-NEXT:    v_writelane_b32 v40, s50, 10
-; GISEL-NEXT:    v_writelane_b32 v40, s51, 11
-; GISEL-NEXT:    v_writelane_b32 v40, s52, 12
-; GISEL-NEXT:    v_writelane_b32 v40, s53, 13
-; GISEL-NEXT:    v_writelane_b32 v40, s54, 14
-; GISEL-NEXT:    v_writelane_b32 v40, s55, 15
-; GISEL-NEXT:    v_writelane_b32 v40, s64, 16
-; GISEL-NEXT:    v_writelane_b32 v40, s65, 17
+; GISEL-NEXT:    v_writelane_b32 v40, s34, 0
+; GISEL-NEXT:    v_writelane_b32 v40, s35, 1
+; GISEL-NEXT:    v_writelane_b32 v40, s36, 2
+; GISEL-NEXT:    v_writelane_b32 v40, s37, 3
+; GISEL-NEXT:    v_writelane_b32 v40, s38, 4
+; GISEL-NEXT:    v_writelane_b32 v40, s39, 5
+; GISEL-NEXT:    v_writelane_b32 v40, s48, 6
+; GISEL-NEXT:    v_writelane_b32 v40, s49, 7
+; GISEL-NEXT:    v_writelane_b32 v40, s50, 8
+; GISEL-NEXT:    v_writelane_b32 v40, s51, 9
+; GISEL-NEXT:    v_writelane_b32 v40, s52, 10
+; GISEL-NEXT:    v_writelane_b32 v40, s53, 11
+; GISEL-NEXT:    v_writelane_b32 v40, s54, 12
+; GISEL-NEXT:    v_writelane_b32 v40, s55, 13
+; GISEL-NEXT:    v_writelane_b32 v40, s64, 14
+; GISEL-NEXT:    v_writelane_b32 v40, s65, 15
+; GISEL-NEXT:    v_writelane_b32 v40, s30, 16
+; GISEL-NEXT:    v_writelane_b32 v40, s31, 17
 ; GISEL-NEXT:    s_mov_b32 s50, s15
 ; GISEL-NEXT:    s_mov_b32 s51, s14
 ; GISEL-NEXT:    s_mov_b32 s52, s13
@@ -259,24 +259,24 @@ define void @test_indirect_call_vgpr_ptr(ptr %fptr) {
 ; GISEL-NEXT:    s_cbranch_execnz .LBB2_1
 ; GISEL-NEXT:  ; %bb.2:
 ; GISEL-NEXT:    s_mov_b64 exec, s[54:55]
-; GISEL-NEXT:    v_readlane_b32 s65, v40, 17
-; GISEL-NEXT:    v_readlane_b32 s64, v40, 16
-; GISEL-NEXT:    v_readlane_b32 s55, v40, 15
-; GISEL-NEXT:    v_readlane_b32 s54, v40, 14
-; GISEL-NEXT:    v_readlane_b32 s53, v40, 13
-; GISEL-NEXT:    v_readlane_b32 s52, v40, 12
-; GISEL-NEXT:    v_readlane_b32 s51, v40, 11
-; GISEL-NEXT:    v_readlane_b32 s50, v40, 10
-; GISEL-NEXT:    v_readlane_b32 s49, v40, 9
-; GISEL-NEXT:    v_readlane_b32 s48, v40, 8
-; GISEL-NEXT:    v_readlane_b32 s39, v40, 7
-; GISEL-NEXT:    v_readlane_b32 s38, v40, 6
-; GISEL-NEXT:    v_readlane_b32 s37, v40, 5
-; GISEL-NEXT:    v_readlane_b32 s36, v40, 4
-; GISEL-NEXT:    v_readlane_b32 s35, v40, 3
-; GISEL-NEXT:    v_readlane_b32 s34, v40, 2
-; GISEL-NEXT:    v_readlane_b32 s31, v40, 1
-; GISEL-NEXT:    v_readlane_b32 s30, v40, 0
+; GISEL-NEXT:    v_readlane_b32 s30, v40, 16
+; GISEL-NEXT:    v_readlane_b32 s31, v40, 17
+; GISEL-NEXT:    v_readlane_b32 s65, v40, 15
+; GISEL-NEXT:    v_readlane_b32 s64, v40, 14
+; GISEL-NEXT:    v_readlane_b32 s55, v40, 13
+; GISEL-NEXT:    v_readlane_b32 s54, v40, 12
+; GISEL-NEXT:    v_readlane_b32 s53, v40, 11
+; GISEL-NEXT:    v_readlane_b32 s52, v40, 10
+; GISEL-NEXT:    v_readlane_b32 s51, v40, 9
+; GISEL-NEXT:    v_readlane_b32 s50, v40, 8
+; GISEL-NEXT:    v_readlane_b32 s49, v40, 7
+; GISEL-NEXT:    v_readlane_b32 s48, v40, 6
+; GISEL-NEXT:    v_readlane_b32 s39, v40, 5
+; GISEL-NEXT:    v_readlane_b32 s38, v40, 4
+; GISEL-NEXT:    v_readlane_b32 s37, v40, 3
+; GISEL-NEXT:    v_readlane_b32 s36, v40, 2
+; GISEL-NEXT:    v_readlane_b32 s35, v40, 1
+; GISEL-NEXT:    v_readlane_b32 s34, v40, 0
 ; GISEL-NEXT:    s_mov_b32 s32, s33
 ; GISEL-NEXT:    v_readlane_b32 s4, v40, 18
 ; GISEL-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -300,24 +300,24 @@ define void @test_indirect_call_vgpr_ptr_arg(ptr %fptr) {
 ; GCN-NEXT:    s_mov_b64 exec, s[18:19]
 ; GCN-NEXT:    v_writelane_b32 v40, s16, 18
 ; GCN-NEXT:    s_addk_i32 s32, 0x400
-; GCN-NEXT:    v_writelane_b32 v40, s30, 0
-; GCN-NEXT:    v_writelane_b32 v40, s31, 1
-; GCN-NEXT:    v_writelane_b32 v40, s34, 2
-; GCN-NEXT:    v_writelane_b32 v40, s35, 3
-; GCN-NEXT:    v_writelane_b32 v40, s36, 4
-; GCN-NEXT:    v_writelane_b32 v40, s37, 5
-; GCN-NEXT:    v_writelane_b32 v40, s38, 6
-; GCN-NEXT:    v_writelane_b32 v40, s39, 7
-; GCN-NEXT:    v_writelane_b32 v40, s48, 8
-; GCN-NEXT:    v_writelane_b32 v40, s49, 9
-; GCN-NEXT:    v_writelane_b32 v40, s50, 10
-; GCN-NEXT:    v_writelane_b32 v40, s51, 11
-; GCN-NEXT:    v_writelane_b32 v40, s52, 12
-; GCN-NEXT:    v_writelane_b32 v40, s53, 13
-; GCN-NEXT:    v_writelane_b32 v40, s54, 14
-; GCN-NEXT:    v_writelane_b32 v40, s55, 15
-; GCN-NEXT:    v_writelane_b32 v40, s64, 16
-; GCN-NEXT:    v_writelane_b32 v40, s65, 17
+; GCN-NEXT:    v_writelane_b32 v40, s34, 0
+; GCN-NEXT:    v_writelane_b32 v40, s35, 1
+; GCN-NEXT:    v_writelane_b32 v40, s36, 2
+; GCN-NEXT:    v_writelane_b32 v40, s37, 3
+; GCN-NEXT:    v_writelane_b32 v40, s38, 4
+; GCN-NEXT:    v_writelane_b32 v40, s39, 5
+; GCN-NEXT:    v_writelane_b32 v40, s48, 6
+; GCN-NEXT:    v_writelane_b32 v40, s49, 7
+; GCN-NEXT:    v_writelane_b32 v40, s50, 8
+; GCN-NEXT:    v_writelane_b32 v40, s51, 9
+; GCN-NEXT:    v_writelane_b32 v40, s52, 10
+; GCN-NEXT:    v_writelane_b32 v40, s53, 11
+; GCN-NEXT:    v_writelane_b32 v40, s54, 12
+; GCN-NEXT:    v_writelane_b32 v40, s55, 13
+; GCN-NEXT:    v_writelane_b32 v40, s64, 14
+; GCN-NEXT:    v_writelane_b32 v40, s65, 15
+; GCN-NEXT:    v_writelane_b32 v40, s30, 16
+; GCN-NEXT:    v_writelane_b32 v40, s31, 17
 ; GCN-NEXT:    s_mov_b32 s50, s15
 ; GCN-NEXT:    s_mov_b32 s51, s14
 ; GCN-NEXT:    s_mov_b32 s52, s13
@@ -350,24 +350,24 @@ define void @test_indirect_call_vgpr_ptr_arg(ptr %fptr) {
 ; GCN-NEXT:    s_cbranch_execnz .LBB3_1
 ; GCN-NEXT:  ; %bb.2:
 ; GCN-NEXT:    s_mov_b64 exec, s[54:55]
-; GCN-NEXT:    v_readlane_b32 s65, v40, 17
-; GCN-NEXT:    v_readlane_b32 s64, v40, 16
-; GCN-NEXT:    v_readlane_b32 s55, v40, 15
-; GCN-NEXT:    v_readlane_b32 s54, v40, 14
-; GCN-NEXT:    v_readlane_b32 s53, v40, 13
-; GCN-NEXT:    v_readlane_b32 s52, v40, 12
-; GCN-NEXT:    v_readlane_b32 s51, v40, 11
-; GCN-NEXT:    v_readlane_b32 s50, v40, 10
-; GCN-NEXT:    v_readlane_b32 s49, v40, 9
-; GCN-NEXT:    v_readlane_b32 s48, v40, 8
-; GCN-NEXT:    v_readlane_b32 s39, v40, 7
-; GCN-NEXT:    v_readlane_b32 s38, v40, 6
-; GCN-NEXT:    v_readlane_b32 s37, v40, 5
-; GCN-NEXT:    v_readlane_b32 s36, v40, 4
-; GCN-NEXT:    v_readlane_b32 s35, v40, 3
-; GCN-NEXT:    v_readlane_b32 s34, v40, 2
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
-; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s30, v40, 16
+; GCN-NEXT:    v_readlane_b32 s31, v40, 17
+; GCN-NEXT:    v_readlane_b32 s65, v40, 15
+; GCN-NEXT:    v_readlane_b32 s64, v40, 14
+; GCN-NEXT:    v_readlane_b32 s55, v40, 13
+; GCN-NEXT:    v_readlane_b32 s54, v40, 12
+; GCN-NEXT:    v_readlane_b32 s53, v40, 11
+; GCN-NEXT:    v_readlane_b32 s52, v40, 10
+; GCN-NEXT:    v_readlane_b32 s51, v40, 9
+; GCN-NEXT:    v_readlane_b32 s50, v40, 8
+; GCN-NEXT:    v_readlane_b32 s49, v40, 7
+; GCN-NEXT:    v_readlane_b32 s48, v40, 6
+; GCN-NEXT:    v_readlane_b32 s39, v40, 5
+; GCN-NEXT:    v_readlane_b32 s38, v40, 4
+; GCN-NEXT:    v_readlane_b32 s37, v40, 3
+; GCN-NEXT:    v_readlane_b32 s36, v40, 2
+; GCN-NEXT:    v_readlane_b32 s35, v40, 1
+; GCN-NEXT:    v_readlane_b32 s34, v40, 0
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 18
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -387,24 +387,24 @@ define void @test_indirect_call_vgpr_ptr_arg(ptr %fptr) {
 ; GISEL-NEXT:    s_mov_b64 exec, s[18:19]
 ; GISEL-NEXT:    v_writelane_b32 v40, s16, 18
 ; GISEL-NEXT:    s_addk_i32 s32, 0x400
-; GISEL-NEXT:    v_writelane_b32 v40, s30, 0
-; GISEL-NEXT:    v_writelane_b32 v40, s31, 1
-; GISEL-NEXT:    v_writelane_b32 v40, s34, 2
-; GISEL-NEXT:    v_writelane_b32 v40, s35, 3
-; GISEL-NEXT:    v_writelane_b32 v40, s36, 4
-; GISEL-NEXT:    v_writelane_b32 v40, s37, 5
-; GISEL-NEXT:    v_writelane_b32 v40, s38, 6
-; GISEL-NEXT:    v_writelane_b32 v40, s39, 7
-; GISEL-NEXT:    v_writelane_b32 v40, s48, 8
-; GISEL-NEXT:    v_writelane_b32 v40, s49, 9
-; GISEL-NEXT:    v_writelane_b32 v40, s50, 10
-; GISEL-NEXT:    v_writelane_b32 v40, s51, 11
-; GISEL-NEXT:    v_writelane_b32 v40, s52, 12
-; GISEL-NEXT:    v_writelane_b32 v40, s53, 13
-; GISEL-NEXT:    v_writelane_b32 v40, s54, 14
-; GISEL-NEXT:    v_writelane_b32 v40, s55, 15
-; GISEL-NEXT:    v_writelane_b32 v40, s64, 16
-; GISEL-NEXT:    v_writelane_b32 v40, s65, 17
+; GISEL-NEXT:    v_writelane_b32 v40, s34, 0
+; GISEL-NEXT:    v_writelane_b32 v40, s35, 1
+; GISEL-NEXT:    v_writelane_b32 v40, s36, 2
+; GISEL-NEXT:    v_writelane_b32 v40, s37, 3
+; GISEL-NEXT:    v_writelane_b32 v40, s38, 4
+; GISEL-NEXT:    v_writelane_b32 v40, s39, 5
+; GISEL-NEXT:    v_writelane_b32 v40, s48, 6
+; GISEL-NEXT:    v_writelane_b32 v40, s49, 7
+; GISEL-NEXT:    v_writelane_b32 v40, s50, 8
+; GISEL-NEXT:    v_writelane_b32 v40, s51, 9
+; GISEL-NEXT:    v_writelane_b32 v40, s52, 10
+; GISEL-NEXT:    v_writelane_b32 v40, s53, 11
+; GISEL-NEXT:    v_writelane_b32 v40, s54, 12
+; GISEL-NEXT:    v_writelane_b32 v40, s55, 13
+; GISEL-NEXT:    v_writelane_b32 v40, s64, 14
+; GISEL-NEXT:    v_writelane_b32 v40, s65, 15
+; GISEL-NEXT:    v_writelane_b32 v40, s30, 16
+; GISEL-NEXT:    v_writelane_b32 v40, s31, 17
 ; GISEL-NEXT:    s_mov_b32 s50, s15
 ; GISEL-NEXT:    s_mov_b32 s51, s14
 ; GISEL-NEXT:    s_mov_b32 s52, s13
@@ -435,24 +435,24 @@ define void @test_indirect_call_vgpr_ptr_arg(ptr %fptr) {
 ; GISEL-NEXT:    s_cbranch_execnz .LBB3_1
 ; GISEL-NEXT:  ; %bb.2:
 ; GISEL-NEXT:    s_mov_b64 exec, s[54:55]
-; GISEL-NEXT:    v_readlane_b32 s65, v40, 17
-; GISEL-NEXT:    v_readlane_b32 s64, v40, 16
-; GISEL-NEXT:    v_readlane_b32 s55, v40, 15
-; GISEL-NEXT:    v_readlane_b32 s54, v40, 14
-; GISEL-NEXT:    v_readlane_b32 s53, v40, 13
-; GISEL-NEXT:    v_readlane_b32 s52, v40, 12
-; GISEL-NEXT:    v_readlane_b32 s51, v40, 11
-; GISEL-NEXT:    v_readlane_b32 s50, v40, 10
-; GISEL-NEXT:    v_readlane_b32 s49, v40, 9
-; GISEL-NEXT:    v_readlane_b32 s48, v40, 8
-; GISEL-NEXT:    v_readlane_b32 s39, v40, 7
-; GISEL-NEXT:    v_readlane_b32 s38, v40, 6
-; GISEL-NEXT:    v_readlane_b32 s37, v40, 5
-; GISEL-NEXT:    v_readlane_b32 s36, v40, 4
-; GISEL-NEXT:    v_readlane_b32 s35, v40, 3
-; GISEL-NEXT:    v_readlane_b32 s34, v40, 2
-; GISEL-NEXT:    v_readlane_b32 s31, v40, 1
-; GISEL-NEXT:    v_readlane_b32 s30, v40, 0
+; GISEL-NEXT:    v_readlane_b32 s30, v40, 16
+; GISEL-NEXT:    v_readlane_b32 s31, v40, 17
+; GISEL-NEXT:    v_readlane_b32 s65, v40, 15
+; GISEL-NEXT:    v_readlane_b32 s64, v40, 14
+; GISEL-NEXT:    v_readlane_b32 s55, v40, 13
+; GISEL-NEXT:    v_readlane_b32 s54, v40, 12
+; GISEL-NEXT:    v_readlane_b32 s53, v40, 11
+; GISEL-NEXT:    v_readlane_b32 s52, v40, 10
+; GISEL-NEXT:    v_readlane_b32 s51, v40, 9
+; GISEL-NEXT:    v_readlane_b32 s50, v40, 8
+; GISEL-NEXT:    v_readlane_b32 s49, v40, 7
+; GISEL-NEXT:    v_readlane_b32 s48, v40, 6
+; GISEL-NEXT:    v_readlane_b32 s39, v40, 5
+; GISEL-NEXT:    v_readlane_b32 s38, v40, 4
+; GISEL-NEXT:    v_readlane_b32 s37, v40, 3
+; GISEL-NEXT:    v_readlane_b32 s36, v40, 2
+; GISEL-NEXT:    v_readlane_b32 s35, v40, 1
+; GISEL-NEXT:    v_readlane_b32 s34, v40, 0
 ; GISEL-NEXT:    s_mov_b32 s32, s33
 ; GISEL-NEXT:    v_readlane_b32 s4, v40, 18
 ; GISEL-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -476,24 +476,24 @@ define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) {
 ; GCN-NEXT:    s_mov_b64 exec, s[18:19]
 ; GCN-NEXT:    v_writelane_b32 v40, s16, 18
 ; GCN-NEXT:    s_addk_i32 s32, 0x400
-; GCN-NEXT:    v_writelane_b32 v40, s30, 0
-; GCN-NEXT:    v_writelane_b32 v40, s31, 1
-; GCN-NEXT:    v_writelane_b32 v40, s34, 2
-; GCN-NEXT:    v_writelane_b32 v40, s35, 3
-; GCN-NEXT:    v_writelane_b32 v40, s36, 4
-; GCN-NEXT:    v_writelane_b32 v40, s37, 5
-; GCN-NEXT:    v_writelane_b32 v40, s38, 6
-; GCN-NEXT:    v_writelane_b32 v40, s39, 7
-; GCN-NEXT:    v_writelane_b32 v40, s48, 8
-; GCN-NEXT:    v_writelane_b32 v40, s49, 9
-; GCN-NEXT:    v_writelane_b32 v40, s50, 10
-; GCN-NEXT:    v_writelane_b32 v40, s51, 11
-; GCN-NEXT:    v_writelane_b32 v40, s52, 12
-; GCN-NEXT:    v_writelane_b32 v40, s53, 13
-; GCN-NEXT:    v_writelane_b32 v40, s54, 14
-; GCN-NEXT:    v_writelane_b32 v40, s55, 15
-; GCN-NEXT:    v_writelane_b32 v40, s64, 16
-; GCN-NEXT:    v_writelane_b32 v40, s65, 17
+; GCN-NEXT:    v_writelane_b32 v40, s34, 0
+; GCN-NEXT:    v_writelane_b32 v40, s35, 1
+; GCN-NEXT:    v_writelane_b32 v40, s36, 2
+; GCN-NEXT:    v_writelane_b32 v40, s37, 3
+; GCN-NEXT:    v_writelane_b32 v40, s38, 4
+; GCN-NEXT:    v_writelane_b32 v40, s39, 5
+; GCN-NEXT:    v_writelane_b32 v40, s48, 6
+; GCN-NEXT:    v_writelane_b32 v40, s49, 7
+; GCN-NEXT:    v_writelane_b32 v40, s50, 8
+; GCN-NEXT:    v_writelane_b32 v40, s51, 9
+; GCN-NEXT:    v_writelane_b32 v40, s52, 10
+; GCN-NEXT:    v_writelane_b32 v40, s53, 11
+; GCN-NEXT:    v_writelane_b32 v40, s54, 12
+; GCN-NEXT:    v_writelane_b32 v40, s55, 13
+; GCN-NEXT:    v_writelane_b32 v40, s64, 14
+; GCN-NEXT:    v_writelane_b32 v40, s65, 15
+; GCN-NEXT:    v_writelane_b32 v40, s30, 16
+; GCN-NEXT:    v_writelane_b32 v40, s31, 17
 ; GCN-NEXT:    s_mov_b32 s50, s15
 ; GCN-NEXT:    s_mov_b32 s51, s14
 ; GCN-NEXT:    s_mov_b32 s52, s13
@@ -525,24 +525,24 @@ define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) {
 ; GCN-NEXT:  ; %bb.2:
 ; GCN-NEXT:    s_mov_b64 exec, s[54:55]
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, 1, v2
-; GCN-NEXT:    v_readlane_b32 s65, v40, 17
-; GCN-NEXT:    v_readlane_b32 s64, v40, 16
-; GCN-NEXT:    v_readlane_b32 s55, v40, 15
-; GCN-NEXT:    v_readlane_b32 s54, v40, 14
-; GCN-NEXT:    v_readlane_b32 s53, v40, 13
-; GCN-NEXT:    v_readlane_b32 s52, v40, 12
-; GCN-NEXT:    v_readlane_b32 s51, v40, 11
-; GCN-NEXT:    v_readlane_b32 s50, v40, 10
-; GCN-NEXT:    v_readlane_b32 s49, v40, 9
-; GCN-NEXT:    v_readlane_b32 s48, v40, 8
-; GCN-NEXT:    v_readlane_b32 s39, v40, 7
-; GCN-NEXT:    v_readlane_b32 s38, v40, 6
-; GCN-NEXT:    v_readlane_b32 s37, v40, 5
-; GCN-NEXT:    v_readlane_b32 s36, v40, 4
-; GCN-NEXT:    v_readlane_b32 s35, v40, 3
-; GCN-NEXT:    v_readlane_b32 s34, v40, 2
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
-; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s30, v40, 16
+; GCN-NEXT:    v_readlane_b32 s31, v40, 17
+; GCN-NEXT:    v_readlane_b32 s65, v40, 15
+; GCN-NEXT:    v_readlane_b32 s64, v40, 14
+; GCN-NEXT:    v_readlane_b32 s55, v40, 13
+; GCN-NEXT:    v_readlane_b32 s54, v40, 12
+; GCN-NEXT:    v_readlane_b32 s53, v40, 11
+; GCN-NEXT:    v_readlane_b32 s52, v40, 10
+; GCN-NEXT:    v_readlane_b32 s51, v40, 9
+; GCN-NEXT:    v_readlane_b32 s50, v40, 8
+; GCN-NEXT:    v_readlane_b32 s49, v40, 7
+; GCN-NEXT:    v_readlane_b32 s48, v40, 6
+; GCN-NEXT:    v_readlane_b32 s39, v40, 5
+; GCN-NEXT:    v_readlane_b32 s38, v40, 4
+; GCN-NEXT:    v_readlane_b32 s37, v40, 3
+; GCN-NEXT:    v_readlane_b32 s36, v40, 2
+; GCN-NEXT:    v_readlane_b32 s35, v40, 1
+; GCN-NEXT:    v_readlane_b32 s34, v40, 0
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 18
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -562,24 +562,24 @@ define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) {
 ; GISEL-NEXT:    s_mov_b64 exec, s[18:19]
 ; GISEL-NEXT:    v_writelane_b32 v40, s16, 18
 ; GISEL-NEXT:    s_addk_i32 s32, 0x400
-; GISEL-NEXT:    v_writelane_b32 v40, s30, 0
-; GISEL-NEXT:    v_writelane_b32 v40, s31, 1
-; GISEL-NEXT:    v_writelane_b32 v40, s34, 2
-; GISEL-NEXT:    v_writelane_b32 v40, s35, 3
-; GISEL-NEXT:    v_writelane_b32 v40, s36, 4
-; GISEL-NEXT:    v_writelane_b32 v40, s37, 5
-; GISEL-NEXT:    v_writelane_b32 v40, s38, 6
-; GISEL-NEXT:    v_writelane_b32 v40, s39, 7
-; GISEL-NEXT:    v_writelane_b32 v40, s48, 8
-; GISEL-NEXT:    v_writelane_b32 v40, s49, 9
-; GISEL-NEXT:    v_writelane_b32 v40, s50, 10
-; GISEL-NEXT:    v_writelane_b32 v40, s51, 11
-; GISEL-NEXT:    v_writelane_b32 v40, s52, 12
-; GISEL-NEXT:    v_writelane_b32 v40, s53, 13
-; GISEL-NEXT:    v_writelane_b32 v40, s54, 14
-; GISEL-NEXT:    v_writelane_b32 v40, s55, 15
-; GISEL-NEXT:    v_writelane_b32 v40, s64, 16
-; GISEL-NEXT:    v_writelane_b32 v40, s65, 17
+; GISEL-NEXT:    v_writelane_b32 v40, s34, 0
+; GISEL-NEXT:    v_writelane_b32 v40, s35, 1
+; GISEL-NEXT:    v_writelane_b32 v40, s36, 2
+; GISEL-NEXT:    v_writelane_b32 v40, s37, 3
+; GISEL-NEXT:    v_writelane_b32 v40, s38, 4
+; GISEL-NEXT:    v_writelane_b32 v40, s39, 5
+; GISEL-NEXT:    v_writelane_b32 v40, s48, 6
+; GISEL-NEXT:    v_writelane_b32 v40, s49, 7
+; GISEL-NEXT:    v_writelane_b32 v40, s50, 8
+; GISEL-NEXT:    v_writelane_b32 v40, s51, 9
+; GISEL-NEXT:    v_writelane_b32 v40, s52, 10
+; GISEL-NEXT:    v_writelane_b32 v40, s53, 11
+; GISEL-NEXT:    v_writelane_b32 v40, s54, 12
+; GISEL-NEXT:    v_writelane_b32 v40, s55, 13
+; GISEL-NEXT:    v_writelane_b32 v40, s64, 14
+; GISEL-NEXT:    v_writelane_b32 v40, s65, 15
+; GISEL-NEXT:    v_writelane_b32 v40, s30, 16
+; GISEL-NEXT:    v_writelane_b32 v40, s31, 17
 ; GISEL-NEXT:    s_mov_b32 s50, s15
 ; GISEL-NEXT:    s_mov_b32 s51, s14
 ; GISEL-NEXT:    s_mov_b32 s52, s13
@@ -611,24 +611,24 @@ define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) {
 ; GISEL-NEXT:  ; %bb.2:
 ; GISEL-NEXT:    s_mov_b64 exec, s[54:55]
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, 1, v1
-; GISEL-NEXT:    v_readlane_b32 s65, v40, 17
-; GISEL-NEXT:    v_readlane_b32 s64, v40, 16
-; GISEL-NEXT:    v_readlane_b32 s55, v40, 15
-; GISEL-NEXT:    v_readlane_b32 s54, v40, 14
-; GISEL-NEXT:    v_readlane_b32 s53, v40, 13
-; GISEL-NEXT:    v_readlane_b32 s52, v40, 12
-; GISEL-NEXT:    v_readlane_b32 s51, v40, 11
-; GISEL-NEXT:    v_readlane_b32 s50, v40, 10
-; GISEL-NEXT:    v_readlane_b32 s49, v40, 9
-; GISEL-NEXT:    v_readlane_b32 s48, v40, 8
-; GISEL-NEXT:    v_readlane_b32 s39, v40, 7
-; GISEL-NEXT:    v_readlane_b32 s38, v40, 6
-; GISEL-NEXT:    v_readlane_b32 s37, v40, 5
-; GISEL-NEXT:    v_readlane_b32 s36, v40, 4
-; GISEL-NEXT:    v_readlane_b32 s35, v40, 3
-; GISEL-NEXT:    v_readlane_b32 s34, v40, 2
-; GISEL-NEXT:    v_readlane_b32 s31, v40, 1
-; GISEL-NEXT:    v_readlane_b32 s30, v40, 0
+; GISEL-NEXT:    v_readlane_b32 s30, v40, 16
+; GISEL-NEXT:    v_readlane_b32 s31, v40, 17
+; GISEL-NEXT:    v_readlane_b32 s65, v40, 15
+; GISEL-NEXT:    v_readlane_b32 s64, v40, 14
+; GISEL-NEXT:    v_readlane_b32 s55, v40, 13
+; GISEL-NEXT:    v_readlane_b32 s54, v40, 12
+; GISEL-NEXT:    v_readlane_b32 s53, v40, 11
+; GISEL-NEXT:    v_readlane_b32 s52, v40, 10
+; GISEL-NEXT:    v_readlane_b32 s51, v40, 9
+; GISEL-NEXT:    v_readlane_b32 s50, v40, 8
+; GISEL-NEXT:    v_readlane_b32 s49, v40, 7
+; GISEL-NEXT:    v_readlane_b32 s48, v40, 6
+; GISEL-NEXT:    v_readlane_b32 s39, v40, 5
+; GISEL-NEXT:    v_readlane_b32 s38, v40, 4
+; GISEL-NEXT:    v_readlane_b32 s37, v40, 3
+; GISEL-NEXT:    v_readlane_b32 s36, v40, 2
+; GISEL-NEXT:    v_readlane_b32 s35, v40, 1
+; GISEL-NEXT:    v_readlane_b32 s34, v40, 0
 ; GISEL-NEXT:    s_mov_b32 s32, s33
 ; GISEL-NEXT:    v_readlane_b32 s4, v40, 18
 ; GISEL-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -653,26 +653,26 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) {
 ; GCN-NEXT:    s_mov_b64 exec, s[18:19]
 ; GCN-NEXT:    v_writelane_b32 v40, s16, 20
 ; GCN-NEXT:    s_addk_i32 s32, 0x400
-; GCN-NEXT:    v_writelane_b32 v40, s30, 0
-; GCN-NEXT:    v_writelane_b32 v40, s31, 1
-; GCN-NEXT:    v_writelane_b32 v40, s34, 2
-; GCN-NEXT:    v_writelane_b32 v40, s35, 3
-; GCN-NEXT:    v_writelane_b32 v40, s36, 4
-; GCN-NEXT:    v_writelane_b32 v40, s37, 5
-; GCN-NEXT:    v_writelane_b32 v40, s38, 6
-; GCN-NEXT:    v_writelane_b32 v40, s39, 7
-; GCN-NEXT:    v_writelane_b32 v40, s48, 8
-; GCN-NEXT:    v_writelane_b32 v40, s49, 9
-; GCN-NEXT:    v_writelane_b32 v40, s50, 10
-; GCN-NEXT:    v_writelane_b32 v40, s51, 11
-; GCN-NEXT:    v_writelane_b32 v40, s52, 12
-; GCN-NEXT:    v_writelane_b32 v40, s53, 13
-; GCN-NEXT:    v_writelane_b32 v40, s54, 14
-; GCN-NEXT:    v_writelane_b32 v40, s55, 15
-; GCN-NEXT:    v_writelane_b32 v40, s64, 16
-; GCN-NEXT:    v_writelane_b32 v40, s65, 17
-; GCN-NEXT:    v_writelane_b32 v40, s66, 18
-; GCN-NEXT:    v_writelane_b32 v40, s67, 19
+; GCN-NEXT:    v_writelane_b32 v40, s34, 0
+; GCN-NEXT:    v_writelane_b32 v40, s35, 1
+; GCN-NEXT:    v_writelane_b32 v40, s36, 2
+; GCN-NEXT:    v_writelane_b32 v40, s37, 3
+; GCN-NEXT:    v_writelane_b32 v40, s38, 4
+; GCN-NEXT:    v_writelane_b32 v40, s39, 5
+; GCN-NEXT:    v_writelane_b32 v40, s48, 6
+; GCN-NEXT:    v_writelane_b32 v40, s49, 7
+; GCN-NEXT:    v_writelane_b32 v40, s50, 8
+; GCN-NEXT:    v_writelane_b32 v40, s51, 9
+; GCN-NEXT:    v_writelane_b32 v40, s52, 10
+; GCN-NEXT:    v_writelane_b32 v40, s53, 11
+; GCN-NEXT:    v_writelane_b32 v40, s54, 12
+; GCN-NEXT:    v_writelane_b32 v40, s55, 13
+; GCN-NEXT:    v_writelane_b32 v40, s64, 14
+; GCN-NEXT:    v_writelane_b32 v40, s65, 15
+; GCN-NEXT:    v_writelane_b32 v40, s66, 16
+; GCN-NEXT:    v_writelane_b32 v40, s67, 17
+; GCN-NEXT:    v_writelane_b32 v40, s30, 18
+; GCN-NEXT:    v_writelane_b32 v40, s31, 19
 ; GCN-NEXT:    s_mov_b32 s50, s15
 ; GCN-NEXT:    s_mov_b32 s51, s14
 ; GCN-NEXT:    s_mov_b32 s52, s13
@@ -709,26 +709,26 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) {
 ; GCN-NEXT:    s_mov_b64 exec, s[64:65]
 ; GCN-NEXT:  .LBB5_4: ; %bb2
 ; GCN-NEXT:    s_or_b64 exec, exec, s[54:55]
-; GCN-NEXT:    v_readlane_b32 s67, v40, 19
-; GCN-NEXT:    v_readlane_b32 s66, v40, 18
-; GCN-NEXT:    v_readlane_b32 s65, v40, 17
-; GCN-NEXT:    v_readlane_b32 s64, v40, 16
-; GCN-NEXT:    v_readlane_b32 s55, v40, 15
-; GCN-NEXT:    v_readlane_b32 s54, v40, 14
-; GCN-NEXT:    v_readlane_b32 s53, v40, 13
-; GCN-NEXT:    v_readlane_b32 s52, v40, 12
-; GCN-NEXT:    v_readlane_b32 s51, v40, 11
-; GCN-NEXT:    v_readlane_b32 s50, v40, 10
-; GCN-NEXT:    v_readlane_b32 s49, v40, 9
-; GCN-NEXT:    v_readlane_b32 s48, v40, 8
-; GCN-NEXT:    v_readlane_b32 s39, v40, 7
-; GCN-NEXT:    v_readlane_b32 s38, v40, 6
-; GCN-NEXT:    v_readlane_b32 s37, v40, 5
-; GCN-NEXT:    v_readlane_b32 s36, v40, 4
-; GCN-NEXT:    v_readlane_b32 s35, v40, 3
-; GCN-NEXT:    v_readlane_b32 s34, v40, 2
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
-; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s30, v40, 18
+; GCN-NEXT:    v_readlane_b32 s31, v40, 19
+; GCN-NEXT:    v_readlane_b32 s67, v40, 17
+; GCN-NEXT:    v_readlane_b32 s66, v40, 16
+; GCN-NEXT:    v_readlane_b32 s65, v40, 15
+; GCN-NEXT:    v_readlane_b32 s64, v40, 14
+; GCN-NEXT:    v_readlane_b32 s55, v40, 13
+; GCN-NEXT:    v_readlane_b32 s54, v40, 12
+; GCN-NEXT:    v_readlane_b32 s53, v40, 11
+; GCN-NEXT:    v_readlane_b32 s52, v40, 10
+; GCN-NEXT:    v_readlane_b32 s51, v40, 9
+; GCN-NEXT:    v_readlane_b32 s50, v40, 8
+; GCN-NEXT:    v_readlane_b32 s49, v40, 7
+; GCN-NEXT:    v_readlane_b32 s48, v40, 6
+; GCN-NEXT:    v_readlane_b32 s39, v40, 5
+; GCN-NEXT:    v_readlane_b32 s38, v40, 4
+; GCN-NEXT:    v_readlane_b32 s37, v40, 3
+; GCN-NEXT:    v_readlane_b32 s36, v40, 2
+; GCN-NEXT:    v_readlane_b32 s35, v40, 1
+; GCN-NEXT:    v_readlane_b32 s34, v40, 0
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 20
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -748,26 +748,26 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) {
 ; GISEL-NEXT:    s_mov_b64 exec, s[18:19]
 ; GISEL-NEXT:    v_writelane_b32 v40, s16, 20
 ; GISEL-NEXT:    s_addk_i32 s32, 0x400
-; GISEL-NEXT:    v_writelane_b32 v40, s30, 0
-; GISEL-NEXT:    v_writelane_b32 v40, s31, 1
-; GISEL-NEXT:    v_writelane_b32 v40, s34, 2
-; GISEL-NEXT:    v_writelane_b32 v40, s35, 3
-; GISEL-NEXT:    v_writelane_b32 v40, s36, 4
-; GISEL-NEXT:    v_writelane_b32 v40, s37, 5
-; GISEL-NEXT:    v_writelane_b32 v40, s38, 6
-; GISEL-NEXT:    v_writelane_b32 v40, s39, 7
-; GISEL-NEXT:    v_writelane_b32 v40, s48, 8
-; GISEL-NEXT:    v_writelane_b32 v40, s49, 9
-; GISEL-NEXT:    v_writelane_b32 v40, s50, 10
-; GISEL-NEXT:    v_writelane_b32 v40, s51, 11
-; GISEL-NEXT:    v_writelane_b32 v40, s52, 12
-; GISEL-NEXT:    v_writelane_b32 v40, s53, 13
-; GISEL-NEXT:    v_writelane_b32 v40, s54, 14
-; GISEL-NEXT:    v_writelane_b32 v40, s55, 15
-; GISEL-NEXT:    v_writelane_b32 v40, s64, 16
-; GISEL-NEXT:    v_writelane_b32 v40, s65, 17
-; GISEL-NEXT:    v_writelane_b32 v40, s66, 18
-; GISEL-NEXT:    v_writelane_b32 v40, s67, 19
+; GISEL-NEXT:    v_writelane_b32 v40, s34, 0
+; GISEL-NEXT:    v_writelane_b32 v40, s35, 1
+; GISEL-NEXT:    v_writelane_b32 v40, s36, 2
+; GISEL-NEXT:    v_writelane_b32 v40, s37, 3
+; GISEL-NEXT:    v_writelane_b32 v40, s38, 4
+; GISEL-NEXT:    v_writelane_b32 v40, s39, 5
+; GISEL-NEXT:    v_writelane_b32 v40, s48, 6
+; GISEL-NEXT:    v_writelane_b32 v40, s49, 7
+; GISEL-NEXT:    v_writelane_b32 v40, s50, 8
+; GISEL-NEXT:    v_writelane_b32 v40, s51, 9
+; GISEL-NEXT:    v_writelane_b32 v40, s52, 10
+; GISEL-NEXT:    v_writelane_b32 v40, s53, 11
+; GISEL-NEXT:    v_writelane_b32 v40, s54, 12
+; GISEL-NEXT:    v_writelane_b32 v40, s55, 13
+; GISEL-NEXT:    v_writelane_b32 v40, s64, 14
+; GISEL-NEXT:    v_writelane_b32 v40, s65, 15
+; GISEL-NEXT:    v_writelane_b32 v40, s66, 16
+; GISEL-NEXT:    v_writelane_b32 v40, s67, 17
+; GISEL-NEXT:    v_writelane_b32 v40, s30, 18
+; GISEL-NEXT:    v_writelane_b32 v40, s31, 19
 ; GISEL-NEXT:    s_mov_b32 s50, s15
 ; GISEL-NEXT:    s_mov_b32 s51, s14
 ; GISEL-NEXT:    s_mov_b32 s52, s13
@@ -804,26 +804,26 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) {
 ; GISEL-NEXT:    s_mov_b64 exec, s[64:65]
 ; GISEL-NEXT:  .LBB5_4: ; %bb2
 ; GISEL-NEXT:    s_or_b64 exec, exec, s[54:55]
-; GISEL-NEXT:    v_readlane_b32 s67, v40, 19
-; GISEL-NEXT:    v_readlane_b32 s66, v40, 18
-; GISEL-NEXT:    v_readlane_b32 s65, v40, 17
-; GISEL-NEXT:    v_readlane_b32 s64, v40, 16
-; GISEL-NEXT:    v_readlane_b32 s55, v40, 15
-; GISEL-NEXT:    v_readlane_b32 s54, v40, 14
-; GISEL-NEXT:    v_readlane_b32 s53, v40, 13
-; GISEL-NEXT:    v_readlane_b32 s52, v40, 12
-; GISEL-NEXT:    v_readlane_b32 s51, v40, 11
-; GISEL-NEXT:    v_readlane_b32 s50, v40, 10
-; GISEL-NEXT:    v_readlane_b32 s49, v40, 9
-; GISEL-NEXT:    v_readlane_b32 s48, v40, 8
-; GISEL-NEXT:    v_readlane_b32 s39, v40, 7
-; GISEL-NEXT:    v_readlane_b32 s38, v40, 6
-; GISEL-NEXT:    v_readlane_b32 s37, v40, 5
-; GISEL-NEXT:    v_readlane_b32 s36, v40, 4
-; GISEL-NEXT:    v_readlane_b32 s35, v40, 3
-; GISEL-NEXT:    v_readlane_b32 s34, v40, 2
-; GISEL-NEXT:    v_readlane_b32 s31, v40, 1
-; GISEL-NEXT:    v_readlane_b32 s30, v40, 0
+; GISEL-NEXT:    v_readlane_b32 s30, v40, 18
+; GISEL-NEXT:    v_readlane_b32 s31, v40, 19
+; GISEL-NEXT:    v_readlane_b32 s67, v40, 17
+; GISEL-NEXT:    v_readlane_b32 s66, v40, 16
+; GISEL-NEXT:    v_readlane_b32 s65, v40, 15
+; GISEL-NEXT:    v_readlane_b32 s64, v40, 14
+; GISEL-NEXT:    v_readlane_b32 s55, v40, 13
+; GISEL-NEXT:    v_readlane_b32 s54, v40, 12
+; GISEL-NEXT:    v_readlane_b32 s53, v40, 11
+; GISEL-NEXT:    v_readlane_b32 s52, v40, 10
+; GISEL-NEXT:    v_readlane_b32 s51, v40, 9
+; GISEL-NEXT:    v_readlane_b32 s50, v40, 8
+; GISEL-NEXT:    v_readlane_b32 s49, v40, 7
+; GISEL-NEXT:    v_readlane_b32 s48, v40, 6
+; GISEL-NEXT:    v_readlane_b32 s39, v40, 5
+; GISEL-NEXT:    v_readlane_b32 s38, v40, 4
+; GISEL-NEXT:    v_readlane_b32 s37, v40, 3
+; GISEL-NEXT:    v_readlane_b32 s36, v40, 2
+; GISEL-NEXT:    v_readlane_b32 s35, v40, 1
+; GISEL-NEXT:    v_readlane_b32 s34, v40, 0
 ; GISEL-NEXT:    s_mov_b32 s32, s33
 ; GISEL-NEXT:    v_readlane_b32 s4, v40, 20
 ; GISEL-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -853,22 +853,22 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) {
 ; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[6:7]
 ; GCN-NEXT:    s_addk_i32 s32, 0x400
-; GCN-NEXT:    v_writelane_b32 v40, s30, 0
-; GCN-NEXT:    v_writelane_b32 v40, s31, 1
-; GCN-NEXT:    v_writelane_b32 v40, s34, 2
-; GCN-NEXT:    v_writelane_b32 v40, s35, 3
-; GCN-NEXT:    v_writelane_b32 v40, s36, 4
-; GCN-NEXT:    v_writelane_b32 v40, s37, 5
-; GCN-NEXT:    v_writelane_b32 v40, s38, 6
-; GCN-NEXT:    v_writelane_b32 v40, s39, 7
-; GCN-NEXT:    v_writelane_b32 v40, s48, 8
-; GCN-NEXT:    v_writelane_b32 v40, s49, 9
-; GCN-NEXT:    v_writelane_b32 v40, s50, 10
-; GCN-NEXT:    v_writelane_b32 v40, s51, 11
-; GCN-NEXT:    v_writelane_b32 v40, s52, 12
-; GCN-NEXT:    v_writelane_b32 v40, s53, 13
-; GCN-NEXT:    v_writelane_b32 v40, s54, 14
-; GCN-NEXT:    v_writelane_b32 v40, s55, 15
+; GCN-NEXT:    v_writelane_b32 v40, s34, 0
+; GCN-NEXT:    v_writelane_b32 v40, s35, 1
+; GCN-NEXT:    v_writelane_b32 v40, s36, 2
+; GCN-NEXT:    v_writelane_b32 v40, s37, 3
+; GCN-NEXT:    v_writelane_b32 v40, s38, 4
+; GCN-NEXT:    v_writelane_b32 v40, s39, 5
+; GCN-NEXT:    v_writelane_b32 v40, s48, 6
+; GCN-NEXT:    v_writelane_b32 v40, s49, 7
+; GCN-NEXT:    v_writelane_b32 v40, s50, 8
+; GCN-NEXT:    v_writelane_b32 v40, s51, 9
+; GCN-NEXT:    v_writelane_b32 v40, s52, 10
+; GCN-NEXT:    v_writelane_b32 v40, s53, 11
+; GCN-NEXT:    v_writelane_b32 v40, s54, 12
+; GCN-NEXT:    v_writelane_b32 v40, s55, 13
+; GCN-NEXT:    v_writelane_b32 v40, s30, 14
+; GCN-NEXT:    v_writelane_b32 v40, s31, 15
 ; GCN-NEXT:    s_mov_b64 s[6:7], exec
 ; GCN-NEXT:  .LBB6_1: ; =>This Inner Loop Header: Depth=1
 ; GCN-NEXT:    v_readfirstlane_b32 s8, v0
@@ -882,22 +882,22 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) {
 ; GCN-NEXT:    s_cbranch_execnz .LBB6_1
 ; GCN-NEXT:  ; %bb.2:
 ; GCN-NEXT:    s_mov_b64 exec, s[6:7]
-; GCN-NEXT:    v_readlane_b32 s55, v40, 15
-; GCN-NEXT:    v_readlane_b32 s54, v40, 14
-; GCN-NEXT:    v_readlane_b32 s53, v40, 13
-; GCN-NEXT:    v_readlane_b32 s52, v40, 12
-; GCN-NEXT:    v_readlane_b32 s51, v40, 11
-; GCN-NEXT:    v_readlane_b32 s50, v40, 10
-; GCN-NEXT:    v_readlane_b32 s49, v40, 9
-; GCN-NEXT:    v_readlane_b32 s48, v40, 8
-; GCN-NEXT:    v_readlane_b32 s39, v40, 7
-; GCN-NEXT:    v_readlane_b32 s38, v40, 6
-; GCN-NEXT:    v_readlane_b32 s37, v40, 5
-; GCN-NEXT:    v_readlane_b32 s36, v40, 4
-; GCN-NEXT:    v_readlane_b32 s35, v40, 3
-; GCN-NEXT:    v_readlane_b32 s34, v40, 2
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
-; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s30, v40, 14
+; GCN-NEXT:    v_readlane_b32 s31, v40, 15
+; GCN-NEXT:    v_readlane_b32 s55, v40, 13
+; GCN-NEXT:    v_readlane_b32 s54, v40, 12
+; GCN-NEXT:    v_readlane_b32 s53, v40, 11
+; GCN-NEXT:    v_readlane_b32 s52, v40, 10
+; GCN-NEXT:    v_readlane_b32 s51, v40, 9
+; GCN-NEXT:    v_readlane_b32 s50, v40, 8
+; GCN-NEXT:    v_readlane_b32 s49, v40, 7
+; GCN-NEXT:    v_readlane_b32 s48, v40, 6
+; GCN-NEXT:    v_readlane_b32 s39, v40, 5
+; GCN-NEXT:    v_readlane_b32 s38, v40, 4
+; GCN-NEXT:    v_readlane_b32 s37, v40, 3
+; GCN-NEXT:    v_readlane_b32 s36, v40, 2
+; GCN-NEXT:    v_readlane_b32 s35, v40, 1
+; GCN-NEXT:    v_readlane_b32 s34, v40, 0
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -915,22 +915,22 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) {
 ; GISEL-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GISEL-NEXT:    s_mov_b64 exec, s[6:7]
 ; GISEL-NEXT:    s_addk_i32 s32, 0x400
-; GISEL-NEXT:    v_writelane_b32 v40, s30, 0
-; GISEL-NEXT:    v_writelane_b32 v40, s31, 1
-; GISEL-NEXT:    v_writelane_b32 v40, s34, 2
-; GISEL-NEXT:    v_writelane_b32 v40, s35, 3
-; GISEL-NEXT:    v_writelane_b32 v40, s36, 4
-; GISEL-NEXT:    v_writelane_b32 v40, s37, 5
-; GISEL-NEXT:    v_writelane_b32 v40, s38, 6
-; GISEL-NEXT:    v_writelane_b32 v40, s39, 7
-; GISEL-NEXT:    v_writelane_b32 v40, s48, 8
-; GISEL-NEXT:    v_writelane_b32 v40, s49, 9
-; GISEL-NEXT:    v_writelane_b32 v40, s50, 10
-; GISEL-NEXT:    v_writelane_b32 v40, s51, 11
-; GISEL-NEXT:    v_writelane_b32 v40, s52, 12
-; GISEL-NEXT:    v_writelane_b32 v40, s53, 13
-; GISEL-NEXT:    v_writelane_b32 v40, s54, 14
-; GISEL-NEXT:    v_writelane_b32 v40, s55, 15
+; GISEL-NEXT:    v_writelane_b32 v40, s34, 0
+; GISEL-NEXT:    v_writelane_b32 v40, s35, 1
+; GISEL-NEXT:    v_writelane_b32 v40, s36, 2
+; GISEL-NEXT:    v_writelane_b32 v40, s37, 3
+; GISEL-NEXT:    v_writelane_b32 v40, s38, 4
+; GISEL-NEXT:    v_writelane_b32 v40, s39, 5
+; GISEL-NEXT:    v_writelane_b32 v40, s48, 6
+; GISEL-NEXT:    v_writelane_b32 v40, s49, 7
+; GISEL-NEXT:    v_writelane_b32 v40, s50, 8
+; GISEL-NEXT:    v_writelane_b32 v40, s51, 9
+; GISEL-NEXT:    v_writelane_b32 v40, s52, 10
+; GISEL-NEXT:    v_writelane_b32 v40, s53, 11
+; GISEL-NEXT:    v_writelane_b32 v40, s54, 12
+; GISEL-NEXT:    v_writelane_b32 v40, s55, 13
+; GISEL-NEXT:    v_writelane_b32 v40, s30, 14
+; GISEL-NEXT:    v_writelane_b32 v40, s31, 15
 ; GISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GISEL-NEXT:  .LBB6_1: ; =>This Inner Loop Header: Depth=1
 ; GISEL-NEXT:    v_readfirstlane_b32 s8, v0
@@ -944,22 +944,22 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) {
 ; GISEL-NEXT:    s_cbranch_execnz .LBB6_1
 ; GISEL-NEXT:  ; %bb.2:
 ; GISEL-NEXT:    s_mov_b64 exec, s[6:7]
-; GISEL-NEXT:    v_readlane_b32 s55, v40, 15
-; GISEL-NEXT:    v_readlane_b32 s54, v40, 14
-; GISEL-NEXT:    v_readlane_b32 s53, v40, 13
-; GISEL-NEXT:    v_readlane_b32 s52, v40, 12
-; GISEL-NEXT:    v_readlane_b32 s51, v40, 11
-; GISEL-NEXT:    v_readlane_b32 s50, v40, 10
-; GISEL-NEXT:    v_readlane_b32 s49, v40, 9
-; GISEL-NEXT:    v_readlane_b32 s48, v40, 8
-; GISEL-NEXT:    v_readlane_b32 s39, v40, 7
-; GISEL-NEXT:    v_readlane_b32 s38, v40, 6
-; GISEL-NEXT:    v_readlane_b32 s37, v40, 5
-; GISEL-NEXT:    v_readlane_b32 s36, v40, 4
-; GISEL-NEXT:    v_readlane_b32 s35, v40, 3
-; GISEL-NEXT:    v_readlane_b32 s34, v40, 2
-; GISEL-NEXT:    v_readlane_b32 s31, v40, 1
-; GISEL-NEXT:    v_readlane_b32 s30, v40, 0
+; GISEL-NEXT:    v_readlane_b32 s30, v40, 14
+; GISEL-NEXT:    v_readlane_b32 s31, v40, 15
+; GISEL-NEXT:    v_readlane_b32 s55, v40, 13
+; GISEL-NEXT:    v_readlane_b32 s54, v40, 12
+; GISEL-NEXT:    v_readlane_b32 s53, v40, 11
+; GISEL-NEXT:    v_readlane_b32 s52, v40, 10
+; GISEL-NEXT:    v_readlane_b32 s51, v40, 9
+; GISEL-NEXT:    v_readlane_b32 s50, v40, 8
+; GISEL-NEXT:    v_readlane_b32 s49, v40, 7
+; GISEL-NEXT:    v_readlane_b32 s48, v40, 6
+; GISEL-NEXT:    v_readlane_b32 s39, v40, 5
+; GISEL-NEXT:    v_readlane_b32 s38, v40, 4
+; GISEL-NEXT:    v_readlane_b32 s37, v40, 3
+; GISEL-NEXT:    v_readlane_b32 s36, v40, 2
+; GISEL-NEXT:    v_readlane_b32 s35, v40, 1
+; GISEL-NEXT:    v_readlane_b32 s34, v40, 0
 ; GISEL-NEXT:    s_mov_b32 s32, s33
 ; GISEL-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GISEL-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -982,22 +982,22 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, ptr %fptr) {
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_addk_i32 s32, 0x400
 ; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-NEXT:    v_writelane_b32 v41, s30, 0
-; GCN-NEXT:    v_writelane_b32 v41, s31, 1
-; GCN-NEXT:    v_writelane_b32 v41, s34, 2
-; GCN-NEXT:    v_writelane_b32 v41, s35, 3
-; GCN-NEXT:    v_writelane_b32 v41, s36, 4
-; GCN-NEXT:    v_writelane_b32 v41, s37, 5
-; GCN-NEXT:    v_writelane_b32 v41, s38, 6
-; GCN-NEXT:    v_writelane_b32 v41, s39, 7
-; GCN-NEXT:    v_writelane_b32 v41, s48, 8
-; GCN-NEXT:    v_writelane_b32 v41, s49, 9
-; GCN-NEXT:    v_writelane_b32 v41, s50, 10
-; GCN-NEXT:    v_writelane_b32 v41, s51, 11
-; GCN-NEXT:    v_writelane_b32 v41, s52, 12
-; GCN-NEXT:    v_writelane_b32 v41, s53, 13
-; GCN-NEXT:    v_writelane_b32 v41, s54, 14
-; GCN-NEXT:    v_writelane_b32 v41, s55, 15
+; GCN-NEXT:    v_writelane_b32 v41, s34, 0
+; GCN-NEXT:    v_writelane_b32 v41, s35, 1
+; GCN-NEXT:    v_writelane_b32 v41, s36, 2
+; GCN-NEXT:    v_writelane_b32 v41, s37, 3
+; GCN-NEXT:    v_writelane_b32 v41, s38, 4
+; GCN-NEXT:    v_writelane_b32 v41, s39, 5
+; GCN-NEXT:    v_writelane_b32 v41, s48, 6
+; GCN-NEXT:    v_writelane_b32 v41, s49, 7
+; GCN-NEXT:    v_writelane_b32 v41, s50, 8
+; GCN-NEXT:    v_writelane_b32 v41, s51, 9
+; GCN-NEXT:    v_writelane_b32 v41, s52, 10
+; GCN-NEXT:    v_writelane_b32 v41, s53, 11
+; GCN-NEXT:    v_writelane_b32 v41, s54, 12
+; GCN-NEXT:    v_writelane_b32 v41, s55, 13
+; GCN-NEXT:    v_writelane_b32 v41, s30, 14
+; GCN-NEXT:    v_writelane_b32 v41, s31, 15
 ; GCN-NEXT:    v_mov_b32_e32 v40, v0
 ; GCN-NEXT:    s_mov_b64 s[4:5], exec
 ; GCN-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
@@ -1013,22 +1013,22 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, ptr %fptr) {
 ; GCN-NEXT:  ; %bb.2:
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    v_mov_b32_e32 v0, v40
-; GCN-NEXT:    v_readlane_b32 s55, v41, 15
-; GCN-NEXT:    v_readlane_b32 s54, v41, 14
-; GCN-NEXT:    v_readlane_b32 s53, v41, 13
-; GCN-NEXT:    v_readlane_b32 s52, v41, 12
-; GCN-NEXT:    v_readlane_b32 s51, v41, 11
-; GCN-NEXT:    v_readlane_b32 s50, v41, 10
-; GCN-NEXT:    v_readlane_b32 s49, v41, 9
-; GCN-NEXT:    v_readlane_b32 s48, v41, 8
-; GCN-NEXT:    v_readlane_b32 s39, v41, 7
-; GCN-NEXT:    v_readlane_b32 s38, v41, 6
-; GCN-NEXT:    v_readlane_b32 s37, v41, 5
-; GCN-NEXT:    v_readlane_b32 s36, v41, 4
-; GCN-NEXT:    v_readlane_b32 s35, v41, 3
-; GCN-NEXT:    v_readlane_b32 s34, v41, 2
-; GCN-NEXT:    v_readlane_b32 s31, v41, 1
-; GCN-NEXT:    v_readlane_b32 s30, v41, 0
+; GCN-NEXT:    v_readlane_b32 s30, v41, 14
+; GCN-NEXT:    v_readlane_b32 s31, v41, 15
+; GCN-NEXT:    v_readlane_b32 s55, v41, 13
+; GCN-NEXT:    v_readlane_b32 s54, v41, 12
+; GCN-NEXT:    v_readlane_b32 s53, v41, 11
+; GCN-NEXT:    v_readlane_b32 s52, v41, 10
+; GCN-NEXT:    v_readlane_b32 s51, v41, 9
+; GCN-NEXT:    v_readlane_b32 s50, v41, 8
+; GCN-NEXT:    v_readlane_b32 s49, v41, 7
+; GCN-NEXT:    v_readlane_b32 s48, v41, 6
+; GCN-NEXT:    v_readlane_b32 s39, v41, 5
+; GCN-NEXT:    v_readlane_b32 s38, v41, 4
+; GCN-NEXT:    v_readlane_b32 s37, v41, 3
+; GCN-NEXT:    v_readlane_b32 s36, v41, 2
+; GCN-NEXT:    v_readlane_b32 s35, v41, 1
+; GCN-NEXT:    v_readlane_b32 s34, v41, 0
 ; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
@@ -1048,22 +1048,22 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, ptr %fptr) {
 ; GISEL-NEXT:    s_mov_b64 exec, s[4:5]
 ; GISEL-NEXT:    s_addk_i32 s32, 0x400
 ; GISEL-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
-; GISEL-NEXT:    v_writelane_b32 v41, s30, 0
-; GISEL-NEXT:    v_writelane_b32 v41, s31, 1
-; GISEL-NEXT:    v_writelane_b32 v41, s34, 2
-; GISEL-NEXT:    v_writelane_b32 v41, s35, 3
-; GISEL-NEXT:    v_writelane_b32 v41, s36, 4
-; GISEL-NEXT:    v_writelane_b32 v41, s37, 5
-; GISEL-NEXT:    v_writelane_b32 v41, s38, 6
-; GISEL-NEXT:    v_writelane_b32 v41, s39, 7
-; GISEL-NEXT:    v_writelane_b32 v41, s48, 8
-; GISEL-NEXT:    v_writelane_b32 v41, s49, 9
-; GISEL-NEXT:    v_writelane_b32 v41, s50, 10
-; GISEL-NEXT:    v_writelane_b32 v41, s51, 11
-; GISEL-NEXT:    v_writelane_b32 v41, s52, 12
-; GISEL-NEXT:    v_writelane_b32 v41, s53, 13
-; GISEL-NEXT:    v_writelane_b32 v41, s54, 14
-; GISEL-NEXT:    v_writelane_b32 v41, s55, 15
+; GISEL-NEXT:    v_writelane_b32 v41, s34, 0
+; GISEL-NEXT:    v_writelane_b32 v41, s35, 1
+; GISEL-NEXT:    v_writelane_b32 v41, s36, 2
+; GISEL-NEXT:    v_writelane_b32 v41, s37, 3
+; GISEL-NEXT:    v_writelane_b32 v41, s38, 4
+; GISEL-NEXT:    v_writelane_b32 v41, s39, 5
+; GISEL-NEXT:    v_writelane_b32 v41, s48, 6
+; GISEL-NEXT:    v_writelane_b32 v41, s49, 7
+; GISEL-NEXT:    v_writelane_b32 v41, s50, 8
+; GISEL-NEXT:    v_writelane_b32 v41, s51, 9
+; GISEL-NEXT:    v_writelane_b32 v41, s52, 10
+; GISEL-NEXT:    v_writelane_b32 v41, s53, 11
+; GISEL-NEXT:    v_writelane_b32 v41, s54, 12
+; GISEL-NEXT:    v_writelane_b32 v41, s55, 13
+; GISEL-NEXT:    v_writelane_b32 v41, s30, 14
+; GISEL-NEXT:    v_writelane_b32 v41, s31, 15
 ; GISEL-NEXT:    v_mov_b32_e32 v40, v0
 ; GISEL-NEXT:    s_mov_b64 s[4:5], exec
 ; GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
@@ -1079,22 +1079,22 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, ptr %fptr) {
 ; GISEL-NEXT:  ; %bb.2:
 ; GISEL-NEXT:    s_mov_b64 exec, s[4:5]
 ; GISEL-NEXT:    v_mov_b32_e32 v0, v40
-; GISEL-NEXT:    v_readlane_b32 s55, v41, 15
-; GISEL-NEXT:    v_readlane_b32 s54, v41, 14
-; GISEL-NEXT:    v_readlane_b32 s53, v41, 13
-; GISEL-NEXT:    v_readlane_b32 s52, v41, 12
-; GISEL-NEXT:    v_readlane_b32 s51, v41, 11
-; GISEL-NEXT:    v_readlane_b32 s50, v41, 10
-; GISEL-NEXT:    v_readlane_b32 s49, v41, 9
-; GISEL-NEXT:    v_readlane_b32 s48, v41, 8
-; GISEL-NEXT:    v_readlane_b32 s39, v41, 7
-; GISEL-NEXT:    v_readlane_b32 s38, v41, 6
-; GISEL-NEXT:    v_readlane_b32 s37, v41, 5
-; GISEL-NEXT:    v_readlane_b32 s36, v41, 4
-; GISEL-NEXT:    v_readlane_b32 s35, v41, 3
-; GISEL-NEXT:    v_readlane_b32 s34, v41, 2
-; GISEL-NEXT:    v_readlane_b32 s31, v41, 1
-; GISEL-NEXT:    v_readlane_b32 s30, v41, 0
+; GISEL-NEXT:    v_readlane_b32 s30, v41, 14
+; GISEL-NEXT:    v_readlane_b32 s31, v41, 15
+; GISEL-NEXT:    v_readlane_b32 s55, v41, 13
+; GISEL-NEXT:    v_readlane_b32 s54, v41, 12
+; GISEL-NEXT:    v_readlane_b32 s53, v41, 11
+; GISEL-NEXT:    v_readlane_b32 s52, v41, 10
+; GISEL-NEXT:    v_readlane_b32 s51, v41, 9
+; GISEL-NEXT:    v_readlane_b32 s50, v41, 8
+; GISEL-NEXT:    v_readlane_b32 s49, v41, 7
+; GISEL-NEXT:    v_readlane_b32 s48, v41, 6
+; GISEL-NEXT:    v_readlane_b32 s39, v41, 5
+; GISEL-NEXT:    v_readlane_b32 s38, v41, 4
+; GISEL-NEXT:    v_readlane_b32 s37, v41, 3
+; GISEL-NEXT:    v_readlane_b32 s36, v41, 2
+; GISEL-NEXT:    v_readlane_b32 s35, v41, 1
+; GISEL-NEXT:    v_readlane_b32 s34, v41, 0
 ; GISEL-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GISEL-NEXT:    s_mov_b32 s32, s33
 ; GISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
@@ -1121,22 +1121,22 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) {
 ; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_addk_i32 s32, 0x400
-; GCN-NEXT:    v_writelane_b32 v40, s30, 0
-; GCN-NEXT:    v_writelane_b32 v40, s31, 1
-; GCN-NEXT:    v_writelane_b32 v40, s34, 2
-; GCN-NEXT:    v_writelane_b32 v40, s35, 3
-; GCN-NEXT:    v_writelane_b32 v40, s36, 4
-; GCN-NEXT:    v_writelane_b32 v40, s37, 5
-; GCN-NEXT:    v_writelane_b32 v40, s38, 6
-; GCN-NEXT:    v_writelane_b32 v40, s39, 7
-; GCN-NEXT:    v_writelane_b32 v40, s48, 8
-; GCN-NEXT:    v_writelane_b32 v40, s49, 9
-; GCN-NEXT:    v_writelane_b32 v40, s50, 10
-; GCN-NEXT:    v_writelane_b32 v40, s51, 11
-; GCN-NEXT:    v_writelane_b32 v40, s52, 12
-; GCN-NEXT:    v_writelane_b32 v40, s53, 13
-; GCN-NEXT:    v_writelane_b32 v40, s54, 14
-; GCN-NEXT:    v_writelane_b32 v40, s55, 15
+; GCN-NEXT:    v_writelane_b32 v40, s34, 0
+; GCN-NEXT:    v_writelane_b32 v40, s35, 1
+; GCN-NEXT:    v_writelane_b32 v40, s36, 2
+; GCN-NEXT:    v_writelane_b32 v40, s37, 3
+; GCN-NEXT:    v_writelane_b32 v40, s38, 4
+; GCN-NEXT:    v_writelane_b32 v40, s39, 5
+; GCN-NEXT:    v_writelane_b32 v40, s48, 6
+; GCN-NEXT:    v_writelane_b32 v40, s49, 7
+; GCN-NEXT:    v_writelane_b32 v40, s50, 8
+; GCN-NEXT:    v_writelane_b32 v40, s51, 9
+; GCN-NEXT:    v_writelane_b32 v40, s52, 10
+; GCN-NEXT:    v_writelane_b32 v40, s53, 11
+; GCN-NEXT:    v_writelane_b32 v40, s54, 12
+; GCN-NEXT:    v_writelane_b32 v40, s55, 13
+; GCN-NEXT:    v_writelane_b32 v40, s30, 14
+; GCN-NEXT:    v_writelane_b32 v40, s31, 15
 ; GCN-NEXT:    s_mov_b64 s[4:5], exec
 ; GCN-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
 ; GCN-NEXT:    v_readfirstlane_b32 s8, v1
@@ -1152,22 +1152,22 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) {
 ; GCN-NEXT:  ; %bb.2:
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    v_mov_b32_e32 v0, v3
-; GCN-NEXT:    v_readlane_b32 s55, v40, 15
-; GCN-NEXT:    v_readlane_b32 s54, v40, 14
-; GCN-NEXT:    v_readlane_b32 s53, v40, 13
-; GCN-NEXT:    v_readlane_b32 s52, v40, 12
-; GCN-NEXT:    v_readlane_b32 s51, v40, 11
-; GCN-NEXT:    v_readlane_b32 s50, v40, 10
-; GCN-NEXT:    v_readlane_b32 s49, v40, 9
-; GCN-NEXT:    v_readlane_b32 s48, v40, 8
-; GCN-NEXT:    v_readlane_b32 s39, v40, 7
-; GCN-NEXT:    v_readlane_b32 s38, v40, 6
-; GCN-NEXT:    v_readlane_b32 s37, v40, 5
-; GCN-NEXT:    v_readlane_b32 s36, v40, 4
-; GCN-NEXT:    v_readlane_b32 s35, v40, 3
-; GCN-NEXT:    v_readlane_b32 s34, v40, 2
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
-; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s30, v40, 14
+; GCN-NEXT:    v_readlane_b32 s31, v40, 15
+; GCN-NEXT:    v_readlane_b32 s55, v40, 13
+; GCN-NEXT:    v_readlane_b32 s54, v40, 12
+; GCN-NEXT:    v_readlane_b32 s53, v40, 11
+; GCN-NEXT:    v_readlane_b32 s52, v40, 10
+; GCN-NEXT:    v_readlane_b32 s51, v40, 9
+; GCN-NEXT:    v_readlane_b32 s50, v40, 8
+; GCN-NEXT:    v_readlane_b32 s49, v40, 7
+; GCN-NEXT:    v_readlane_b32 s48, v40, 6
+; GCN-NEXT:    v_readlane_b32 s39, v40, 5
+; GCN-NEXT:    v_readlane_b32 s38, v40, 4
+; GCN-NEXT:    v_readlane_b32 s37, v40, 3
+; GCN-NEXT:    v_readlane_b32 s36, v40, 2
+; GCN-NEXT:    v_readlane_b32 s35, v40, 1
+; GCN-NEXT:    v_readlane_b32 s34, v40, 0
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -1185,22 +1185,22 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) {
 ; GISEL-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GISEL-NEXT:    s_mov_b64 exec, s[4:5]
 ; GISEL-NEXT:    s_addk_i32 s32, 0x400
-; GISEL-NEXT:    v_writelane_b32 v40, s30, 0
-; GISEL-NEXT:    v_writelane_b32 v40, s31, 1
-; GISEL-NEXT:    v_writelane_b32 v40, s34, 2
-; GISEL-NEXT:    v_writelane_b32 v40, s35, 3
-; GISEL-NEXT:    v_writelane_b32 v40, s36, 4
-; GISEL-NEXT:    v_writelane_b32 v40, s37, 5
-; GISEL-NEXT:    v_writelane_b32 v40, s38, 6
-; GISEL-NEXT:    v_writelane_b32 v40, s39, 7
-; GISEL-NEXT:    v_writelane_b32 v40, s48, 8
-; GISEL-NEXT:    v_writelane_b32 v40, s49, 9
-; GISEL-NEXT:    v_writelane_b32 v40, s50, 10
-; GISEL-NEXT:    v_writelane_b32 v40, s51, 11
-; GISEL-NEXT:    v_writelane_b32 v40, s52, 12
-; GISEL-NEXT:    v_writelane_b32 v40, s53, 13
-; GISEL-NEXT:    v_writelane_b32 v40, s54, 14
-; GISEL-NEXT:    v_writelane_b32 v40, s55, 15
+; GISEL-NEXT:    v_writelane_b32 v40, s34, 0
+; GISEL-NEXT:    v_writelane_b32 v40, s35, 1
+; GISEL-NEXT:    v_writelane_b32 v40, s36, 2
+; GISEL-NEXT:    v_writelane_b32 v40, s37, 3
+; GISEL-NEXT:    v_writelane_b32 v40, s38, 4
+; GISEL-NEXT:    v_writelane_b32 v40, s39, 5
+; GISEL-NEXT:    v_writelane_b32 v40, s48, 6
+; GISEL-NEXT:    v_writelane_b32 v40, s49, 7
+; GISEL-NEXT:    v_writelane_b32 v40, s50, 8
+; GISEL-NEXT:    v_writelane_b32 v40, s51, 9
+; GISEL-NEXT:    v_writelane_b32 v40, s52, 10
+; GISEL-NEXT:    v_writelane_b32 v40, s53, 11
+; GISEL-NEXT:    v_writelane_b32 v40, s54, 12
+; GISEL-NEXT:    v_writelane_b32 v40, s55, 13
+; GISEL-NEXT:    v_writelane_b32 v40, s30, 14
+; GISEL-NEXT:    v_writelane_b32 v40, s31, 15
 ; GISEL-NEXT:    s_mov_b64 s[4:5], exec
 ; GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
 ; GISEL-NEXT:    v_readfirstlane_b32 s8, v1
@@ -1216,22 +1216,22 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) {
 ; GISEL-NEXT:  ; %bb.2:
 ; GISEL-NEXT:    s_mov_b64 exec, s[4:5]
 ; GISEL-NEXT:    v_mov_b32_e32 v0, v2
-; GISEL-NEXT:    v_readlane_b32 s55, v40, 15
-; GISEL-NEXT:    v_readlane_b32 s54, v40, 14
-; GISEL-NEXT:    v_readlane_b32 s53, v40, 13
-; GISEL-NEXT:    v_readlane_b32 s52, v40, 12
-; GISEL-NEXT:    v_readlane_b32 s51, v40, 11
-; GISEL-NEXT:    v_readlane_b32 s50, v40, 10
-; GISEL-NEXT:    v_readlane_b32 s49, v40, 9
-; GISEL-NEXT:    v_readlane_b32 s48, v40, 8
-; GISEL-NEXT:    v_readlane_b32 s39, v40, 7
-; GISEL-NEXT:    v_readlane_b32 s38, v40, 6
-; GISEL-NEXT:    v_readlane_b32 s37, v40, 5
-; GISEL-NEXT:    v_readlane_b32 s36, v40, 4
-; GISEL-NEXT:    v_readlane_b32 s35, v40, 3
-; GISEL-NEXT:    v_readlane_b32 s34, v40, 2
-; GISEL-NEXT:    v_readlane_b32 s31, v40, 1
-; GISEL-NEXT:    v_readlane_b32 s30, v40, 0
+; GISEL-NEXT:    v_readlane_b32 s30, v40, 14
+; GISEL-NEXT:    v_readlane_b32 s31, v40, 15
+; GISEL-NEXT:    v_readlane_b32 s55, v40, 13
+; GISEL-NEXT:    v_readlane_b32 s54, v40, 12
+; GISEL-NEXT:    v_readlane_b32 s53, v40, 11
+; GISEL-NEXT:    v_readlane_b32 s52, v40, 10
+; GISEL-NEXT:    v_readlane_b32 s51, v40, 9
+; GISEL-NEXT:    v_readlane_b32 s50, v40, 8
+; GISEL-NEXT:    v_readlane_b32 s49, v40, 7
+; GISEL-NEXT:    v_readlane_b32 s48, v40, 6
+; GISEL-NEXT:    v_readlane_b32 s39, v40, 5
+; GISEL-NEXT:    v_readlane_b32 s38, v40, 4
+; GISEL-NEXT:    v_readlane_b32 s37, v40, 3
+; GISEL-NEXT:    v_readlane_b32 s36, v40, 2
+; GISEL-NEXT:    v_readlane_b32 s35, v40, 1
+; GISEL-NEXT:    v_readlane_b32 s34, v40, 0
 ; GISEL-NEXT:    s_mov_b32 s32, s33
 ; GISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GISEL-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -1254,22 +1254,22 @@ define void @test_indirect_tail_call_vgpr_ptr(ptr %fptr) {
 ; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_addk_i32 s32, 0x400
-; GCN-NEXT:    v_writelane_b32 v40, s30, 0
-; GCN-NEXT:    v_writelane_b32 v40, s31, 1
-; GCN-NEXT:    v_writelane_b32 v40, s34, 2
-; GCN-NEXT:    v_writelane_b32 v40, s35, 3
-; GCN-NEXT:    v_writelane_b32 v40, s36, 4
-; GCN-NEXT:    v_writelane_b32 v40, s37, 5
-; GCN-NEXT:    v_writelane_b32 v40, s38, 6
-; GCN-NEXT:    v_writelane_b32 v40, s39, 7
-; GCN-NEXT:    v_writelane_b32 v40, s48, 8
-; GCN-NEXT:    v_writelane_b32 v40, s49, 9
-; GCN-NEXT:    v_writelane_b32 v40, s50, 10
-; GCN-NEXT:    v_writelane_b32 v40, s51, 11
-; GCN-NEXT:    v_writelane_b32 v40, s52, 12
-; GCN-NEXT:    v_writelane_b32 v40, s53, 13
-; GCN-NEXT:    v_writelane_b32 v40, s54, 14
-; GCN-NEXT:    v_writelane_b32 v40, s55, 15
+; GCN-NEXT:    v_writelane_b32 v40, s34, 0
+; GCN-NEXT:    v_writelane_b32 v40, s35, 1
+; GCN-NEXT:    v_writelane_b32 v40, s36, 2
+; GCN-NEXT:    v_writelane_b32 v40, s37, 3
+; GCN-NEXT:    v_writelane_b32 v40, s38, 4
+; GCN-NEXT:    v_writelane_b32 v40, s39, 5
+; GCN-NEXT:    v_writelane_b32 v40, s48, 6
+; GCN-NEXT:    v_writelane_b32 v40, s49, 7
+; GCN-NEXT:    v_writelane_b32 v40, s50, 8
+; GCN-NEXT:    v_writelane_b32 v40, s51, 9
+; GCN-NEXT:    v_writelane_b32 v40, s52, 10
+; GCN-NEXT:    v_writelane_b32 v40, s53, 11
+; GCN-NEXT:    v_writelane_b32 v40, s54, 12
+; GCN-NEXT:    v_writelane_b32 v40, s55, 13
+; GCN-NEXT:    v_writelane_b32 v40, s30, 14
+; GCN-NEXT:    v_writelane_b32 v40, s31, 15
 ; GCN-NEXT:    s_mov_b64 s[4:5], exec
 ; GCN-NEXT:  .LBB9_1: ; =>This Inner Loop Header: Depth=1
 ; GCN-NEXT:    v_readfirstlane_b32 s6, v0
@@ -1282,22 +1282,22 @@ define void @test_indirect_tail_call_vgpr_ptr(ptr %fptr) {
 ; GCN-NEXT:    s_cbranch_execnz .LBB9_1
 ; GCN-NEXT:  ; %bb.2:
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
-; GCN-NEXT:    v_readlane_b32 s55, v40, 15
-; GCN-NEXT:    v_readlane_b32 s54, v40, 14
-; GCN-NEXT:    v_readlane_b32 s53, v40, 13
-; GCN-NEXT:    v_readlane_b32 s52, v40, 12
-; GCN-NEXT:    v_readlane_b32 s51, v40, 11
-; GCN-NEXT:    v_readlane_b32 s50, v40, 10
-; GCN-NEXT:    v_readlane_b32 s49, v40, 9
-; GCN-NEXT:    v_readlane_b32 s48, v40, 8
-; GCN-NEXT:    v_readlane_b32 s39, v40, 7
-; GCN-NEXT:    v_readlane_b32 s38, v40, 6
-; GCN-NEXT:    v_readlane_b32 s37, v40, 5
-; GCN-NEXT:    v_readlane_b32 s36, v40, 4
-; GCN-NEXT:    v_readlane_b32 s35, v40, 3
-; GCN-NEXT:    v_readlane_b32 s34, v40, 2
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
-; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s30, v40, 14
+; GCN-NEXT:    v_readlane_b32 s31, v40, 15
+; GCN-NEXT:    v_readlane_b32 s55, v40, 13
+; GCN-NEXT:    v_readlane_b32 s54, v40, 12
+; GCN-NEXT:    v_readlane_b32 s53, v40, 11
+; GCN-NEXT:    v_readlane_b32 s52, v40, 10
+; GCN-NEXT:    v_readlane_b32 s51, v40, 9
+; GCN-NEXT:    v_readlane_b32 s50, v40, 8
+; GCN-NEXT:    v_readlane_b32 s49, v40, 7
+; GCN-NEXT:    v_readlane_b32 s48, v40, 6
+; GCN-NEXT:    v_readlane_b32 s39, v40, 5
+; GCN-NEXT:    v_readlane_b32 s38, v40, 4
+; GCN-NEXT:    v_readlane_b32 s37, v40, 3
+; GCN-NEXT:    v_readlane_b32 s36, v40, 2
+; GCN-NEXT:    v_readlane_b32 s35, v40, 1
+; GCN-NEXT:    v_readlane_b32 s34, v40, 0
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -1315,22 +1315,22 @@ define void @test_indirect_tail_call_vgpr_ptr(ptr %fptr) {
 ; GISEL-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GISEL-NEXT:    s_mov_b64 exec, s[4:5]
 ; GISEL-NEXT:    s_addk_i32 s32, 0x400
-; GISEL-NEXT:    v_writelane_b32 v40, s30, 0
-; GISEL-NEXT:    v_writelane_b32 v40, s31, 1
-; GISEL-NEXT:    v_writelane_b32 v40, s34, 2
-; GISEL-NEXT:    v_writelane_b32 v40, s35, 3
-; GISEL-NEXT:    v_writelane_b32 v40, s36, 4
-; GISEL-NEXT:    v_writelane_b32 v40, s37, 5
-; GISEL-NEXT:    v_writelane_b32 v40, s38, 6
-; GISEL-NEXT:    v_writelane_b32 v40, s39, 7
-; GISEL-NEXT:    v_writelane_b32 v40, s48, 8
-; GISEL-NEXT:    v_writelane_b32 v40, s49, 9
-; GISEL-NEXT:    v_writelane_b32 v40, s50, 10
-; GISEL-NEXT:    v_writelane_b32 v40, s51, 11
-; GISEL-NEXT:    v_writelane_b32 v40, s52, 12
-; GISEL-NEXT:    v_writelane_b32 v40, s53, 13
-; GISEL-NEXT:    v_writelane_b32 v40, s54, 14
-; GISEL-NEXT:    v_writelane_b32 v40, s55, 15
+; GISEL-NEXT:    v_writelane_b32 v40, s34, 0
+; GISEL-NEXT:    v_writelane_b32 v40, s35, 1
+; GISEL-NEXT:    v_writelane_b32 v40, s36, 2
+; GISEL-NEXT:    v_writelane_b32 v40, s37, 3
+; GISEL-NEXT:    v_writelane_b32 v40, s38, 4
+; GISEL-NEXT:    v_writelane_b32 v40, s39, 5
+; GISEL-NEXT:    v_writelane_b32 v40, s48, 6
+; GISEL-NEXT:    v_writelane_b32 v40, s49, 7
+; GISEL-NEXT:    v_writelane_b32 v40, s50, 8
+; GISEL-NEXT:    v_writelane_b32 v40, s51, 9
+; GISEL-NEXT:    v_writelane_b32 v40, s52, 10
+; GISEL-NEXT:    v_writelane_b32 v40, s53, 11
+; GISEL-NEXT:    v_writelane_b32 v40, s54, 12
+; GISEL-NEXT:    v_writelane_b32 v40, s55, 13
+; GISEL-NEXT:    v_writelane_b32 v40, s30, 14
+; GISEL-NEXT:    v_writelane_b32 v40, s31, 15
 ; GISEL-NEXT:    s_mov_b64 s[4:5], exec
 ; GISEL-NEXT:  .LBB9_1: ; =>This Inner Loop Header: Depth=1
 ; GISEL-NEXT:    v_readfirstlane_b32 s6, v0
@@ -1343,22 +1343,22 @@ define void @test_indirect_tail_call_vgpr_ptr(ptr %fptr) {
 ; GISEL-NEXT:    s_cbranch_execnz .LBB9_1
 ; GISEL-NEXT:  ; %bb.2:
 ; GISEL-NEXT:    s_mov_b64 exec, s[4:5]
-; GISEL-NEXT:    v_readlane_b32 s55, v40, 15
-; GISEL-NEXT:    v_readlane_b32 s54, v40, 14
-; GISEL-NEXT:    v_readlane_b32 s53, v40, 13
-; GISEL-NEXT:    v_readlane_b32 s52, v40, 12
-; GISEL-NEXT:    v_readlane_b32 s51, v40, 11
-; GISEL-NEXT:    v_readlane_b32 s50, v40, 10
-; GISEL-NEXT:    v_readlane_b32 s49, v40, 9
-; GISEL-NEXT:    v_readlane_b32 s48, v40, 8
-; GISEL-NEXT:    v_readlane_b32 s39, v40, 7
-; GISEL-NEXT:    v_readlane_b32 s38, v40, 6
-; GISEL-NEXT:    v_readlane_b32 s37, v40, 5
-; GISEL-NEXT:    v_readlane_b32 s36, v40, 4
-; GISEL-NEXT:    v_readlane_b32 s35, v40, 3
-; GISEL-NEXT:    v_readlane_b32 s34, v40, 2
-; GISEL-NEXT:    v_readlane_b32 s31, v40, 1
-; GISEL-NEXT:    v_readlane_b32 s30, v40, 0
+; GISEL-NEXT:    v_readlane_b32 s30, v40, 14
+; GISEL-NEXT:    v_readlane_b32 s31, v40, 15
+; GISEL-NEXT:    v_readlane_b32 s55, v40, 13
+; GISEL-NEXT:    v_readlane_b32 s54, v40, 12
+; GISEL-NEXT:    v_readlane_b32 s53, v40, 11
+; GISEL-NEXT:    v_readlane_b32 s52, v40, 10
+; GISEL-NEXT:    v_readlane_b32 s51, v40, 9
+; GISEL-NEXT:    v_readlane_b32 s50, v40, 8
+; GISEL-NEXT:    v_readlane_b32 s49, v40, 7
+; GISEL-NEXT:    v_readlane_b32 s48, v40, 6
+; GISEL-NEXT:    v_readlane_b32 s39, v40, 5
+; GISEL-NEXT:    v_readlane_b32 s38, v40, 4
+; GISEL-NEXT:    v_readlane_b32 s37, v40, 3
+; GISEL-NEXT:    v_readlane_b32 s36, v40, 2
+; GISEL-NEXT:    v_readlane_b32 s35, v40, 1
+; GISEL-NEXT:    v_readlane_b32 s34, v40, 0
 ; GISEL-NEXT:    s_mov_b32 s32, s33
 ; GISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GISEL-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
index c3f391786f878..fcc43ffd0140e 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
@@ -35,8 +35,8 @@ define void @f0() {
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v4, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v4, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v4, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_load_b32 v4, off, s33 ; 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll
index df784a6e00cfc..19869e85ec9d9 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll
@@ -7,13 +7,13 @@ define fastcc i32 @foo() #0 {
   ; CHECK-LABEL: name: foo
   ; CHECK: bb.0 (%ir-block.0):
   ; CHECK-NEXT:   successors: %bb.1(0x80000000)
-  ; CHECK-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr30, $sgpr31, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11
+  ; CHECK-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   S_WAITCNT 0
   ; CHECK-NEXT:   $sgpr16 = S_MOV_B32 $sgpr33
   ; CHECK-NEXT:   $sgpr33 = S_MOV_B32 $sgpr32
   ; CHECK-NEXT:   $sgpr17 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
-  ; CHECK-NEXT:   BUFFER_STORE_DWORD_OFFSET $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5)
+  ; CHECK-NEXT:   BUFFER_STORE_DWORD_OFFSET $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5)
   ; CHECK-NEXT:   $exec_lo = S_MOV_B32 killed $sgpr17
   ; CHECK-NEXT:   $sgpr32 = frame-setup S_ADDK_I32 $sgpr32, 512, implicit-def dead $scc
   ; CHECK-NEXT:   $vgpr40 = V_WRITELANE_B32 killed $sgpr16, 2, undef $vgpr40
@@ -26,8 +26,8 @@ define fastcc i32 @foo() #0 {
   ; CHECK-NEXT:   BUFFER_GL1_INV implicit $exec
   ; CHECK-NEXT:   BUFFER_GL0_INV implicit $exec
   ; CHECK-NEXT:   renamable $sgpr16_sgpr17 = S_LOAD_DWORDX2_IMM killed renamable $sgpr16_sgpr17, 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4)
-  ; CHECK-NEXT:   $vgpr40 = V_WRITELANE_B32 killed $sgpr30, 0, $vgpr40
-  ; CHECK-NEXT:   $vgpr40 = V_WRITELANE_B32 killed $sgpr31, 1, $vgpr40
+  ; CHECK-NEXT:   $vgpr40 = V_WRITELANE_B32 killed $sgpr30, 0, $vgpr40, implicit-def $sgpr30_sgpr31, implicit $sgpr30_sgpr31
+  ; CHECK-NEXT:   $vgpr40 = V_WRITELANE_B32 killed $sgpr31, 1, $vgpr40, implicit $sgpr30_sgpr31
   ; CHECK-NEXT:   S_WAITCNT 49279
   ; CHECK-NEXT:   dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr16_sgpr17, @bar, csr_amdgpu, implicit killed $sgpr4_sgpr5, implicit killed $sgpr6_sgpr7, implicit killed $sgpr8_sgpr9, implicit killed $sgpr10_sgpr11, implicit killed $sgpr12, implicit killed $sgpr13, implicit killed $sgpr14, implicit killed $sgpr15, implicit killed $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   $vcc_lo = S_MOV_B32 $exec_lo
@@ -39,12 +39,12 @@ define fastcc i32 @foo() #0 {
   ; CHECK-NEXT:   S_CBRANCH_VCCNZ %bb.1, implicit $vcc_lo
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2.DummyReturnBlock:
+  ; CHECK-NEXT:   $sgpr30 = V_READLANE_B32 $vgpr40, 0, implicit-def $sgpr30_sgpr31
   ; CHECK-NEXT:   $sgpr31 = V_READLANE_B32 $vgpr40, 1
-  ; CHECK-NEXT:   $sgpr30 = V_READLANE_B32 $vgpr40, 0
   ; CHECK-NEXT:   $sgpr32 = S_MOV_B32 $sgpr33
   ; CHECK-NEXT:   $sgpr4 = V_READLANE_B32 $vgpr40, 2
   ; CHECK-NEXT:   $sgpr5 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
-  ; CHECK-NEXT:   $vgpr40 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5)
+  ; CHECK-NEXT:   $vgpr40 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5)
   ; CHECK-NEXT:   $exec_lo = S_MOV_B32 killed $sgpr5
   ; CHECK-NEXT:   $sgpr33 = S_MOV_B32 killed $sgpr4
   ; CHECK-NEXT:   S_WAITCNT 16240
diff --git a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll
index 17581bcb61e99..8c2c16ccdc2a0 100644
--- a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll
@@ -37,26 +37,26 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
 ; GFX7-NEXT:    s_add_i32 s6, s32, 0x101100
 ; GFX7-NEXT:    buffer_store_dword v23, off, s[0:3], s6 ; 4-byte Folded Spill
 ; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX7-NEXT:    v_writelane_b32 v23, s30, 0
-; GFX7-NEXT:    v_writelane_b32 v23, s31, 1
-; GFX7-NEXT:    v_writelane_b32 v23, s33, 2
-; GFX7-NEXT:    v_writelane_b32 v23, s34, 3
-; GFX7-NEXT:    v_writelane_b32 v23, s35, 4
-; GFX7-NEXT:    v_writelane_b32 v23, s36, 5
-; GFX7-NEXT:    v_writelane_b32 v23, s37, 6
-; GFX7-NEXT:    v_writelane_b32 v23, s38, 7
-; GFX7-NEXT:    v_writelane_b32 v23, s39, 8
-; GFX7-NEXT:    v_writelane_b32 v23, s48, 9
-; GFX7-NEXT:    v_writelane_b32 v23, s49, 10
-; GFX7-NEXT:    v_writelane_b32 v23, s50, 11
-; GFX7-NEXT:    v_writelane_b32 v23, s51, 12
-; GFX7-NEXT:    v_writelane_b32 v23, s52, 13
-; GFX7-NEXT:    v_writelane_b32 v23, s53, 14
+; GFX7-NEXT:    v_writelane_b32 v23, s33, 0
+; GFX7-NEXT:    v_writelane_b32 v23, s34, 1
+; GFX7-NEXT:    v_writelane_b32 v23, s35, 2
+; GFX7-NEXT:    v_writelane_b32 v23, s36, 3
+; GFX7-NEXT:    v_writelane_b32 v23, s37, 4
+; GFX7-NEXT:    v_writelane_b32 v23, s38, 5
+; GFX7-NEXT:    v_writelane_b32 v23, s39, 6
+; GFX7-NEXT:    v_writelane_b32 v23, s48, 7
+; GFX7-NEXT:    v_writelane_b32 v23, s49, 8
+; GFX7-NEXT:    v_writelane_b32 v23, s50, 9
+; GFX7-NEXT:    v_writelane_b32 v23, s51, 10
+; GFX7-NEXT:    v_writelane_b32 v23, s52, 11
+; GFX7-NEXT:    v_writelane_b32 v23, s53, 12
+; GFX7-NEXT:    v_writelane_b32 v23, s54, 13
+; GFX7-NEXT:    v_writelane_b32 v23, s55, 14
 ; GFX7-NEXT:    v_lshr_b32_e64 v0, s32, 6
-; GFX7-NEXT:    v_writelane_b32 v23, s54, 15
+; GFX7-NEXT:    v_writelane_b32 v23, s30, 15
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 64, v0
 ; GFX7-NEXT:    s_and_b64 s[4:5], 0, exec
-; GFX7-NEXT:    v_writelane_b32 v23, s55, 16
+; GFX7-NEXT:    v_writelane_b32 v23, s31, 16
 ; GFX7-NEXT:    ;;#ASMSTART
 ; GFX7-NEXT:    ; use alloca0 v0
 ; GFX7-NEXT:    ;;#ASMEND
@@ -73,23 +73,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
 ; GFX7-NEXT:    ;;#ASMSTART
 ; GFX7-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s54, scc
 ; GFX7-NEXT:    ;;#ASMEND
-; GFX7-NEXT:    v_readlane_b32 s55, v23, 16
-; GFX7-NEXT:    v_readlane_b32 s54, v23, 15
-; GFX7-NEXT:    v_readlane_b32 s53, v23, 14
-; GFX7-NEXT:    v_readlane_b32 s52, v23, 13
-; GFX7-NEXT:    v_readlane_b32 s51, v23, 12
-; GFX7-NEXT:    v_readlane_b32 s50, v23, 11
-; GFX7-NEXT:    v_readlane_b32 s49, v23, 10
-; GFX7-NEXT:    v_readlane_b32 s48, v23, 9
-; GFX7-NEXT:    v_readlane_b32 s39, v23, 8
-; GFX7-NEXT:    v_readlane_b32 s38, v23, 7
-; GFX7-NEXT:    v_readlane_b32 s37, v23, 6
-; GFX7-NEXT:    v_readlane_b32 s36, v23, 5
-; GFX7-NEXT:    v_readlane_b32 s35, v23, 4
-; GFX7-NEXT:    v_readlane_b32 s34, v23, 3
-; GFX7-NEXT:    v_readlane_b32 s33, v23, 2
-; GFX7-NEXT:    v_readlane_b32 s31, v23, 1
-; GFX7-NEXT:    v_readlane_b32 s30, v23, 0
+; GFX7-NEXT:    v_readlane_b32 s30, v23, 15
+; GFX7-NEXT:    v_readlane_b32 s31, v23, 16
+; GFX7-NEXT:    v_readlane_b32 s55, v23, 14
+; GFX7-NEXT:    v_readlane_b32 s54, v23, 13
+; GFX7-NEXT:    v_readlane_b32 s53, v23, 12
+; GFX7-NEXT:    v_readlane_b32 s52, v23, 11
+; GFX7-NEXT:    v_readlane_b32 s51, v23, 10
+; GFX7-NEXT:    v_readlane_b32 s50, v23, 9
+; GFX7-NEXT:    v_readlane_b32 s49, v23, 8
+; GFX7-NEXT:    v_readlane_b32 s48, v23, 7
+; GFX7-NEXT:    v_readlane_b32 s39, v23, 6
+; GFX7-NEXT:    v_readlane_b32 s38, v23, 5
+; GFX7-NEXT:    v_readlane_b32 s37, v23, 4
+; GFX7-NEXT:    v_readlane_b32 s36, v23, 3
+; GFX7-NEXT:    v_readlane_b32 s35, v23, 2
+; GFX7-NEXT:    v_readlane_b32 s34, v23, 1
+; GFX7-NEXT:    v_readlane_b32 s33, v23, 0
 ; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX7-NEXT:    s_add_i32 s6, s32, 0x101100
 ; GFX7-NEXT:    buffer_load_dword v23, off, s[0:3], s6 ; 4-byte Folded Reload
@@ -104,26 +104,26 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
 ; GFX8-NEXT:    s_add_i32 s6, s32, 0x101100
 ; GFX8-NEXT:    buffer_store_dword v23, off, s[0:3], s6 ; 4-byte Folded Spill
 ; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX8-NEXT:    v_writelane_b32 v23, s30, 0
-; GFX8-NEXT:    v_writelane_b32 v23, s31, 1
-; GFX8-NEXT:    v_writelane_b32 v23, s33, 2
-; GFX8-NEXT:    v_writelane_b32 v23, s34, 3
-; GFX8-NEXT:    v_writelane_b32 v23, s35, 4
-; GFX8-NEXT:    v_writelane_b32 v23, s36, 5
-; GFX8-NEXT:    v_writelane_b32 v23, s37, 6
-; GFX8-NEXT:    v_writelane_b32 v23, s38, 7
-; GFX8-NEXT:    v_writelane_b32 v23, s39, 8
-; GFX8-NEXT:    v_writelane_b32 v23, s48, 9
-; GFX8-NEXT:    v_writelane_b32 v23, s49, 10
-; GFX8-NEXT:    v_writelane_b32 v23, s50, 11
-; GFX8-NEXT:    v_writelane_b32 v23, s51, 12
-; GFX8-NEXT:    v_writelane_b32 v23, s52, 13
-; GFX8-NEXT:    v_writelane_b32 v23, s53, 14
+; GFX8-NEXT:    v_writelane_b32 v23, s33, 0
+; GFX8-NEXT:    v_writelane_b32 v23, s34, 1
+; GFX8-NEXT:    v_writelane_b32 v23, s35, 2
+; GFX8-NEXT:    v_writelane_b32 v23, s36, 3
+; GFX8-NEXT:    v_writelane_b32 v23, s37, 4
+; GFX8-NEXT:    v_writelane_b32 v23, s38, 5
+; GFX8-NEXT:    v_writelane_b32 v23, s39, 6
+; GFX8-NEXT:    v_writelane_b32 v23, s48, 7
+; GFX8-NEXT:    v_writelane_b32 v23, s49, 8
+; GFX8-NEXT:    v_writelane_b32 v23, s50, 9
+; GFX8-NEXT:    v_writelane_b32 v23, s51, 10
+; GFX8-NEXT:    v_writelane_b32 v23, s52, 11
+; GFX8-NEXT:    v_writelane_b32 v23, s53, 12
+; GFX8-NEXT:    v_writelane_b32 v23, s54, 13
+; GFX8-NEXT:    v_writelane_b32 v23, s55, 14
 ; GFX8-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
-; GFX8-NEXT:    v_writelane_b32 v23, s54, 15
+; GFX8-NEXT:    v_writelane_b32 v23, s30, 15
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 64, v0
 ; GFX8-NEXT:    s_and_b64 s[4:5], 0, exec
-; GFX8-NEXT:    v_writelane_b32 v23, s55, 16
+; GFX8-NEXT:    v_writelane_b32 v23, s31, 16
 ; GFX8-NEXT:    ;;#ASMSTART
 ; GFX8-NEXT:    ; use alloca0 v0
 ; GFX8-NEXT:    ;;#ASMEND
@@ -141,23 +141,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
 ; GFX8-NEXT:    ;;#ASMSTART
 ; GFX8-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s54, scc
 ; GFX8-NEXT:    ;;#ASMEND
-; GFX8-NEXT:    v_readlane_b32 s55, v23, 16
-; GFX8-NEXT:    v_readlane_b32 s54, v23, 15
-; GFX8-NEXT:    v_readlane_b32 s53, v23, 14
-; GFX8-NEXT:    v_readlane_b32 s52, v23, 13
-; GFX8-NEXT:    v_readlane_b32 s51, v23, 12
-; GFX8-NEXT:    v_readlane_b32 s50, v23, 11
-; GFX8-NEXT:    v_readlane_b32 s49, v23, 10
-; GFX8-NEXT:    v_readlane_b32 s48, v23, 9
-; GFX8-NEXT:    v_readlane_b32 s39, v23, 8
-; GFX8-NEXT:    v_readlane_b32 s38, v23, 7
-; GFX8-NEXT:    v_readlane_b32 s37, v23, 6
-; GFX8-NEXT:    v_readlane_b32 s36, v23, 5
-; GFX8-NEXT:    v_readlane_b32 s35, v23, 4
-; GFX8-NEXT:    v_readlane_b32 s34, v23, 3
-; GFX8-NEXT:    v_readlane_b32 s33, v23, 2
-; GFX8-NEXT:    v_readlane_b32 s31, v23, 1
-; GFX8-NEXT:    v_readlane_b32 s30, v23, 0
+; GFX8-NEXT:    v_readlane_b32 s30, v23, 15
+; GFX8-NEXT:    v_readlane_b32 s31, v23, 16
+; GFX8-NEXT:    v_readlane_b32 s55, v23, 14
+; GFX8-NEXT:    v_readlane_b32 s54, v23, 13
+; GFX8-NEXT:    v_readlane_b32 s53, v23, 12
+; GFX8-NEXT:    v_readlane_b32 s52, v23, 11
+; GFX8-NEXT:    v_readlane_b32 s51, v23, 10
+; GFX8-NEXT:    v_readlane_b32 s50, v23, 9
+; GFX8-NEXT:    v_readlane_b32 s49, v23, 8
+; GFX8-NEXT:    v_readlane_b32 s48, v23, 7
+; GFX8-NEXT:    v_readlane_b32 s39, v23, 6
+; GFX8-NEXT:    v_readlane_b32 s38, v23, 5
+; GFX8-NEXT:    v_readlane_b32 s37, v23, 4
+; GFX8-NEXT:    v_readlane_b32 s36, v23, 3
+; GFX8-NEXT:    v_readlane_b32 s35, v23, 2
+; GFX8-NEXT:    v_readlane_b32 s34, v23, 1
+; GFX8-NEXT:    v_readlane_b32 s33, v23, 0
 ; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX8-NEXT:    s_add_i32 s6, s32, 0x101100
 ; GFX8-NEXT:    buffer_load_dword v23, off, s[0:3], s6 ; 4-byte Folded Reload
@@ -172,26 +172,26 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
 ; GFX900-NEXT:    s_add_i32 s6, s32, 0x101100
 ; GFX900-NEXT:    buffer_store_dword v23, off, s[0:3], s6 ; 4-byte Folded Spill
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX900-NEXT:    v_writelane_b32 v23, s30, 0
-; GFX900-NEXT:    v_writelane_b32 v23, s31, 1
-; GFX900-NEXT:    v_writelane_b32 v23, s33, 2
-; GFX900-NEXT:    v_writelane_b32 v23, s34, 3
-; GFX900-NEXT:    v_writelane_b32 v23, s35, 4
-; GFX900-NEXT:    v_writelane_b32 v23, s36, 5
-; GFX900-NEXT:    v_writelane_b32 v23, s37, 6
-; GFX900-NEXT:    v_writelane_b32 v23, s38, 7
-; GFX900-NEXT:    v_writelane_b32 v23, s39, 8
-; GFX900-NEXT:    v_writelane_b32 v23, s48, 9
-; GFX900-NEXT:    v_writelane_b32 v23, s49, 10
-; GFX900-NEXT:    v_writelane_b32 v23, s50, 11
-; GFX900-NEXT:    v_writelane_b32 v23, s51, 12
-; GFX900-NEXT:    v_writelane_b32 v23, s52, 13
-; GFX900-NEXT:    v_writelane_b32 v23, s53, 14
+; GFX900-NEXT:    v_writelane_b32 v23, s33, 0
+; GFX900-NEXT:    v_writelane_b32 v23, s34, 1
+; GFX900-NEXT:    v_writelane_b32 v23, s35, 2
+; GFX900-NEXT:    v_writelane_b32 v23, s36, 3
+; GFX900-NEXT:    v_writelane_b32 v23, s37, 4
+; GFX900-NEXT:    v_writelane_b32 v23, s38, 5
+; GFX900-NEXT:    v_writelane_b32 v23, s39, 6
+; GFX900-NEXT:    v_writelane_b32 v23, s48, 7
+; GFX900-NEXT:    v_writelane_b32 v23, s49, 8
+; GFX900-NEXT:    v_writelane_b32 v23, s50, 9
+; GFX900-NEXT:    v_writelane_b32 v23, s51, 10
+; GFX900-NEXT:    v_writelane_b32 v23, s52, 11
+; GFX900-NEXT:    v_writelane_b32 v23, s53, 12
+; GFX900-NEXT:    v_writelane_b32 v23, s54, 13
+; GFX900-NEXT:    v_writelane_b32 v23, s55, 14
 ; GFX900-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
-; GFX900-NEXT:    v_writelane_b32 v23, s54, 15
+; GFX900-NEXT:    v_writelane_b32 v23, s30, 15
 ; GFX900-NEXT:    v_add_u32_e32 v0, 64, v0
 ; GFX900-NEXT:    s_and_b64 s[4:5], 0, exec
-; GFX900-NEXT:    v_writelane_b32 v23, s55, 16
+; GFX900-NEXT:    v_writelane_b32 v23, s31, 16
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use alloca0 v0
 ; GFX900-NEXT:    ;;#ASMEND
@@ -208,23 +208,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s54, scc
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    v_readlane_b32 s55, v23, 16
-; GFX900-NEXT:    v_readlane_b32 s54, v23, 15
-; GFX900-NEXT:    v_readlane_b32 s53, v23, 14
-; GFX900-NEXT:    v_readlane_b32 s52, v23, 13
-; GFX900-NEXT:    v_readlane_b32 s51, v23, 12
-; GFX900-NEXT:    v_readlane_b32 s50, v23, 11
-; GFX900-NEXT:    v_readlane_b32 s49, v23, 10
-; GFX900-NEXT:    v_readlane_b32 s48, v23, 9
-; GFX900-NEXT:    v_readlane_b32 s39, v23, 8
-; GFX900-NEXT:    v_readlane_b32 s38, v23, 7
-; GFX900-NEXT:    v_readlane_b32 s37, v23, 6
-; GFX900-NEXT:    v_readlane_b32 s36, v23, 5
-; GFX900-NEXT:    v_readlane_b32 s35, v23, 4
-; GFX900-NEXT:    v_readlane_b32 s34, v23, 3
-; GFX900-NEXT:    v_readlane_b32 s33, v23, 2
-; GFX900-NEXT:    v_readlane_b32 s31, v23, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v23, 0
+; GFX900-NEXT:    v_readlane_b32 s30, v23, 15
+; GFX900-NEXT:    v_readlane_b32 s31, v23, 16
+; GFX900-NEXT:    v_readlane_b32 s55, v23, 14
+; GFX900-NEXT:    v_readlane_b32 s54, v23, 13
+; GFX900-NEXT:    v_readlane_b32 s53, v23, 12
+; GFX900-NEXT:    v_readlane_b32 s52, v23, 11
+; GFX900-NEXT:    v_readlane_b32 s51, v23, 10
+; GFX900-NEXT:    v_readlane_b32 s50, v23, 9
+; GFX900-NEXT:    v_readlane_b32 s49, v23, 8
+; GFX900-NEXT:    v_readlane_b32 s48, v23, 7
+; GFX900-NEXT:    v_readlane_b32 s39, v23, 6
+; GFX900-NEXT:    v_readlane_b32 s38, v23, 5
+; GFX900-NEXT:    v_readlane_b32 s37, v23, 4
+; GFX900-NEXT:    v_readlane_b32 s36, v23, 3
+; GFX900-NEXT:    v_readlane_b32 s35, v23, 2
+; GFX900-NEXT:    v_readlane_b32 s34, v23, 1
+; GFX900-NEXT:    v_readlane_b32 s33, v23, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    s_add_i32 s6, s32, 0x101100
 ; GFX900-NEXT:    buffer_load_dword v23, off, s[0:3], s6 ; 4-byte Folded Reload
@@ -239,26 +239,26 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
 ; GFX942-NEXT:    s_add_i32 s2, s32, 0x4044
 ; GFX942-NEXT:    scratch_store_dword off, v23, s2 ; 4-byte Folded Spill
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX942-NEXT:    v_writelane_b32 v23, s30, 0
-; GFX942-NEXT:    v_writelane_b32 v23, s31, 1
-; GFX942-NEXT:    v_writelane_b32 v23, s33, 2
-; GFX942-NEXT:    v_writelane_b32 v23, s34, 3
-; GFX942-NEXT:    v_writelane_b32 v23, s35, 4
-; GFX942-NEXT:    v_writelane_b32 v23, s36, 5
-; GFX942-NEXT:    v_writelane_b32 v23, s37, 6
-; GFX942-NEXT:    v_writelane_b32 v23, s38, 7
-; GFX942-NEXT:    v_writelane_b32 v23, s39, 8
-; GFX942-NEXT:    v_writelane_b32 v23, s48, 9
-; GFX942-NEXT:    v_writelane_b32 v23, s49, 10
-; GFX942-NEXT:    v_writelane_b32 v23, s50, 11
-; GFX942-NEXT:    v_writelane_b32 v23, s51, 12
-; GFX942-NEXT:    v_writelane_b32 v23, s52, 13
-; GFX942-NEXT:    v_writelane_b32 v23, s53, 14
+; GFX942-NEXT:    v_writelane_b32 v23, s33, 0
+; GFX942-NEXT:    v_writelane_b32 v23, s34, 1
+; GFX942-NEXT:    v_writelane_b32 v23, s35, 2
+; GFX942-NEXT:    v_writelane_b32 v23, s36, 3
+; GFX942-NEXT:    v_writelane_b32 v23, s37, 4
+; GFX942-NEXT:    v_writelane_b32 v23, s38, 5
+; GFX942-NEXT:    v_writelane_b32 v23, s39, 6
+; GFX942-NEXT:    v_writelane_b32 v23, s48, 7
+; GFX942-NEXT:    v_writelane_b32 v23, s49, 8
+; GFX942-NEXT:    v_writelane_b32 v23, s50, 9
+; GFX942-NEXT:    v_writelane_b32 v23, s51, 10
+; GFX942-NEXT:    v_writelane_b32 v23, s52, 11
+; GFX942-NEXT:    v_writelane_b32 v23, s53, 12
+; GFX942-NEXT:    v_writelane_b32 v23, s54, 13
+; GFX942-NEXT:    v_writelane_b32 v23, s55, 14
 ; GFX942-NEXT:    s_add_i32 s0, s32, 64
-; GFX942-NEXT:    v_writelane_b32 v23, s54, 15
+; GFX942-NEXT:    v_writelane_b32 v23, s30, 15
 ; GFX942-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX942-NEXT:    s_and_b64 s[60:61], 0, exec
-; GFX942-NEXT:    v_writelane_b32 v23, s55, 16
+; GFX942-NEXT:    v_writelane_b32 v23, s31, 16
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use alloca0 v0
 ; GFX942-NEXT:    ;;#ASMEND
@@ -273,23 +273,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s54, scc
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_readlane_b32 s55, v23, 16
-; GFX942-NEXT:    v_readlane_b32 s54, v23, 15
-; GFX942-NEXT:    v_readlane_b32 s53, v23, 14
-; GFX942-NEXT:    v_readlane_b32 s52, v23, 13
-; GFX942-NEXT:    v_readlane_b32 s51, v23, 12
-; GFX942-NEXT:    v_readlane_b32 s50, v23, 11
-; GFX942-NEXT:    v_readlane_b32 s49, v23, 10
-; GFX942-NEXT:    v_readlane_b32 s48, v23, 9
-; GFX942-NEXT:    v_readlane_b32 s39, v23, 8
-; GFX942-NEXT:    v_readlane_b32 s38, v23, 7
-; GFX942-NEXT:    v_readlane_b32 s37, v23, 6
-; GFX942-NEXT:    v_readlane_b32 s36, v23, 5
-; GFX942-NEXT:    v_readlane_b32 s35, v23, 4
-; GFX942-NEXT:    v_readlane_b32 s34, v23, 3
-; GFX942-NEXT:    v_readlane_b32 s33, v23, 2
-; GFX942-NEXT:    v_readlane_b32 s31, v23, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v23, 0
+; GFX942-NEXT:    v_readlane_b32 s30, v23, 15
+; GFX942-NEXT:    v_readlane_b32 s31, v23, 16
+; GFX942-NEXT:    v_readlane_b32 s55, v23, 14
+; GFX942-NEXT:    v_readlane_b32 s54, v23, 13
+; GFX942-NEXT:    v_readlane_b32 s53, v23, 12
+; GFX942-NEXT:    v_readlane_b32 s52, v23, 11
+; GFX942-NEXT:    v_readlane_b32 s51, v23, 10
+; GFX942-NEXT:    v_readlane_b32 s50, v23, 9
+; GFX942-NEXT:    v_readlane_b32 s49, v23, 8
+; GFX942-NEXT:    v_readlane_b32 s48, v23, 7
+; GFX942-NEXT:    v_readlane_b32 s39, v23, 6
+; GFX942-NEXT:    v_readlane_b32 s38, v23, 5
+; GFX942-NEXT:    v_readlane_b32 s37, v23, 4
+; GFX942-NEXT:    v_readlane_b32 s36, v23, 3
+; GFX942-NEXT:    v_readlane_b32 s35, v23, 2
+; GFX942-NEXT:    v_readlane_b32 s34, v23, 1
+; GFX942-NEXT:    v_readlane_b32 s33, v23, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    s_add_i32 s2, s32, 0x4044
 ; GFX942-NEXT:    scratch_load_dword v23, off, s2 ; 4-byte Folded Reload
@@ -305,29 +305,29 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
 ; GFX10_1-NEXT:    buffer_store_dword v23, off, s[0:3], s5 ; 4-byte Folded Spill
 ; GFX10_1-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10_1-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10_1-NEXT:    v_writelane_b32 v23, s30, 0
+; GFX10_1-NEXT:    v_writelane_b32 v23, s33, 0
 ; GFX10_1-NEXT:    v_lshrrev_b32_e64 v0, 5, s32
 ; GFX10_1-NEXT:    s_and_b32 s4, 0, exec_lo
-; GFX10_1-NEXT:    v_writelane_b32 v23, s31, 1
+; GFX10_1-NEXT:    v_writelane_b32 v23, s34, 1
 ; GFX10_1-NEXT:    v_add_nc_u32_e32 v0, 64, v0
 ; GFX10_1-NEXT:    ;;#ASMSTART
 ; GFX10_1-NEXT:    ; use alloca0 v0
 ; GFX10_1-NEXT:    ;;#ASMEND
-; GFX10_1-NEXT:    v_writelane_b32 v23, s33, 2
-; GFX10_1-NEXT:    v_writelane_b32 v23, s34, 3
-; GFX10_1-NEXT:    v_writelane_b32 v23, s35, 4
-; GFX10_1-NEXT:    v_writelane_b32 v23, s36, 5
-; GFX10_1-NEXT:    v_writelane_b32 v23, s37, 6
-; GFX10_1-NEXT:    v_writelane_b32 v23, s38, 7
-; GFX10_1-NEXT:    v_writelane_b32 v23, s39, 8
-; GFX10_1-NEXT:    v_writelane_b32 v23, s48, 9
-; GFX10_1-NEXT:    v_writelane_b32 v23, s49, 10
-; GFX10_1-NEXT:    v_writelane_b32 v23, s50, 11
-; GFX10_1-NEXT:    v_writelane_b32 v23, s51, 12
-; GFX10_1-NEXT:    v_writelane_b32 v23, s52, 13
-; GFX10_1-NEXT:    v_writelane_b32 v23, s53, 14
-; GFX10_1-NEXT:    v_writelane_b32 v23, s54, 15
-; GFX10_1-NEXT:    v_writelane_b32 v23, s55, 16
+; GFX10_1-NEXT:    v_writelane_b32 v23, s35, 2
+; GFX10_1-NEXT:    v_writelane_b32 v23, s36, 3
+; GFX10_1-NEXT:    v_writelane_b32 v23, s37, 4
+; GFX10_1-NEXT:    v_writelane_b32 v23, s38, 5
+; GFX10_1-NEXT:    v_writelane_b32 v23, s39, 6
+; GFX10_1-NEXT:    v_writelane_b32 v23, s48, 7
+; GFX10_1-NEXT:    v_writelane_b32 v23, s49, 8
+; GFX10_1-NEXT:    v_writelane_b32 v23, s50, 9
+; GFX10_1-NEXT:    v_writelane_b32 v23, s51, 10
+; GFX10_1-NEXT:    v_writelane_b32 v23, s52, 11
+; GFX10_1-NEXT:    v_writelane_b32 v23, s53, 12
+; GFX10_1-NEXT:    v_writelane_b32 v23, s54, 13
+; GFX10_1-NEXT:    v_writelane_b32 v23, s55, 14
+; GFX10_1-NEXT:    v_writelane_b32 v23, s30, 15
+; GFX10_1-NEXT:    v_writelane_b32 v23, s31, 16
 ; GFX10_1-NEXT:    ;;#ASMSTART
 ; GFX10_1-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc
 ; GFX10_1-NEXT:    ;;#ASMEND
@@ -338,23 +338,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
 ; GFX10_1-NEXT:    ;;#ASMSTART
 ; GFX10_1-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s54, scc
 ; GFX10_1-NEXT:    ;;#ASMEND
-; GFX10_1-NEXT:    v_readlane_b32 s55, v23, 16
-; GFX10_1-NEXT:    v_readlane_b32 s54, v23, 15
-; GFX10_1-NEXT:    v_readlane_b32 s53, v23, 14
-; GFX10_1-NEXT:    v_readlane_b32 s52, v23, 13
-; GFX10_1-NEXT:    v_readlane_b32 s51, v23, 12
-; GFX10_1-NEXT:    v_readlane_b32 s50, v23, 11
-; GFX10_1-NEXT:    v_readlane_b32 s49, v23, 10
-; GFX10_1-NEXT:    v_readlane_b32 s48, v23, 9
-; GFX10_1-NEXT:    v_readlane_b32 s39, v23, 8
-; GFX10_1-NEXT:    v_readlane_b32 s38, v23, 7
-; GFX10_1-NEXT:    v_readlane_b32 s37, v23, 6
-; GFX10_1-NEXT:    v_readlane_b32 s36, v23, 5
-; GFX10_1-NEXT:    v_readlane_b32 s35, v23, 4
-; GFX10_1-NEXT:    v_readlane_b32 s34, v23, 3
-; GFX10_1-NEXT:    v_readlane_b32 s33, v23, 2
-; GFX10_1-NEXT:    v_readlane_b32 s31, v23, 1
-; GFX10_1-NEXT:    v_readlane_b32 s30, v23, 0
+; GFX10_1-NEXT:    v_readlane_b32 s30, v23, 15
+; GFX10_1-NEXT:    v_readlane_b32 s31, v23, 16
+; GFX10_1-NEXT:    v_readlane_b32 s55, v23, 14
+; GFX10_1-NEXT:    v_readlane_b32 s54, v23, 13
+; GFX10_1-NEXT:    v_readlane_b32 s53, v23, 12
+; GFX10_1-NEXT:    v_readlane_b32 s52, v23, 11
+; GFX10_1-NEXT:    v_readlane_b32 s51, v23, 10
+; GFX10_1-NEXT:    v_readlane_b32 s50, v23, 9
+; GFX10_1-NEXT:    v_readlane_b32 s49, v23, 8
+; GFX10_1-NEXT:    v_readlane_b32 s48, v23, 7
+; GFX10_1-NEXT:    v_readlane_b32 s39, v23, 6
+; GFX10_1-NEXT:    v_readlane_b32 s38, v23, 5
+; GFX10_1-NEXT:    v_readlane_b32 s37, v23, 4
+; GFX10_1-NEXT:    v_readlane_b32 s36, v23, 3
+; GFX10_1-NEXT:    v_readlane_b32 s35, v23, 2
+; GFX10_1-NEXT:    v_readlane_b32 s34, v23, 1
+; GFX10_1-NEXT:    v_readlane_b32 s33, v23, 0
 ; GFX10_1-NEXT:    s_xor_saveexec_b32 s4, -1
 ; GFX10_1-NEXT:    s_add_i32 s5, s32, 0x80880
 ; GFX10_1-NEXT:    buffer_load_dword v23, off, s[0:3], s5 ; 4-byte Folded Reload
@@ -370,29 +370,29 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
 ; GFX10_3-NEXT:    s_add_i32 s5, s32, 0x80880
 ; GFX10_3-NEXT:    buffer_store_dword v23, off, s[0:3], s5 ; 4-byte Folded Spill
 ; GFX10_3-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10_3-NEXT:    v_writelane_b32 v23, s30, 0
+; GFX10_3-NEXT:    v_writelane_b32 v23, s33, 0
 ; GFX10_3-NEXT:    v_lshrrev_b32_e64 v0, 5, s32
 ; GFX10_3-NEXT:    s_and_b32 s4, 0, exec_lo
-; GFX10_3-NEXT:    v_writelane_b32 v23, s31, 1
+; GFX10_3-NEXT:    v_writelane_b32 v23, s34, 1
 ; GFX10_3-NEXT:    v_add_nc_u32_e32 v0, 64, v0
 ; GFX10_3-NEXT:    ;;#ASMSTART
 ; GFX10_3-NEXT:    ; use alloca0 v0
 ; GFX10_3-NEXT:    ;;#ASMEND
-; GFX10_3-NEXT:    v_writelane_b32 v23, s33, 2
-; GFX10_3-NEXT:    v_writelane_b32 v23, s34, 3
-; GFX10_3-NEXT:    v_writelane_b32 v23, s35, 4
-; GFX10_3-NEXT:    v_writelane_b32 v23, s36, 5
-; GFX10_3-NEXT:    v_writelane_b32 v23, s37, 6
-; GFX10_3-NEXT:    v_writelane_b32 v23, s38, 7
-; GFX10_3-NEXT:    v_writelane_b32 v23, s39, 8
-; GFX10_3-NEXT:    v_writelane_b32 v23, s48, 9
-; GFX10_3-NEXT:    v_writelane_b32 v23, s49, 10
-; GFX10_3-NEXT:    v_writelane_b32 v23, s50, 11
-; GFX10_3-NEXT:    v_writelane_b32 v23, s51, 12
-; GFX10_3-NEXT:    v_writelane_b32 v23, s52, 13
-; GFX10_3-NEXT:    v_writelane_b32 v23, s53, 14
-; GFX10_3-NEXT:    v_writelane_b32 v23, s54, 15
-; GFX10_3-NEXT:    v_writelane_b32 v23, s55, 16
+; GFX10_3-NEXT:    v_writelane_b32 v23, s35, 2
+; GFX10_3-NEXT:    v_writelane_b32 v23, s36, 3
+; GFX10_3-NEXT:    v_writelane_b32 v23, s37, 4
+; GFX10_3-NEXT:    v_writelane_b32 v23, s38, 5
+; GFX10_3-NEXT:    v_writelane_b32 v23, s39, 6
+; GFX10_3-NEXT:    v_writelane_b32 v23, s48, 7
+; GFX10_3-NEXT:    v_writelane_b32 v23, s49, 8
+; GFX10_3-NEXT:    v_writelane_b32 v23, s50, 9
+; GFX10_3-NEXT:    v_writelane_b32 v23, s51, 10
+; GFX10_3-NEXT:    v_writelane_b32 v23, s52, 11
+; GFX10_3-NEXT:    v_writelane_b32 v23, s53, 12
+; GFX10_3-NEXT:    v_writelane_b32 v23, s54, 13
+; GFX10_3-NEXT:    v_writelane_b32 v23, s55, 14
+; GFX10_3-NEXT:    v_writelane_b32 v23, s30, 15
+; GFX10_3-NEXT:    v_writelane_b32 v23, s31, 16
 ; GFX10_3-NEXT:    ;;#ASMSTART
 ; GFX10_3-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc
 ; GFX10_3-NEXT:    ;;#ASMEND
@@ -403,23 +403,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
 ; GFX10_3-NEXT:    ;;#ASMSTART
 ; GFX10_3-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s54, scc
 ; GFX10_3-NEXT:    ;;#ASMEND
-; GFX10_3-NEXT:    v_readlane_b32 s55, v23, 16
-; GFX10_3-NEXT:    v_readlane_b32 s54, v23, 15
-; GFX10_3-NEXT:    v_readlane_b32 s53, v23, 14
-; GFX10_3-NEXT:    v_readlane_b32 s52, v23, 13
-; GFX10_3-NEXT:    v_readlane_b32 s51, v23, 12
-; GFX10_3-NEXT:    v_readlane_b32 s50, v23, 11
-; GFX10_3-NEXT:    v_readlane_b32 s49, v23, 10
-; GFX10_3-NEXT:    v_readlane_b32 s48, v23, 9
-; GFX10_3-NEXT:    v_readlane_b32 s39, v23, 8
-; GFX10_3-NEXT:    v_readlane_b32 s38, v23, 7
-; GFX10_3-NEXT:    v_readlane_b32 s37, v23, 6
-; GFX10_3-NEXT:    v_readlane_b32 s36, v23, 5
-; GFX10_3-NEXT:    v_readlane_b32 s35, v23, 4
-; GFX10_3-NEXT:    v_readlane_b32 s34, v23, 3
-; GFX10_3-NEXT:    v_readlane_b32 s33, v23, 2
-; GFX10_3-NEXT:    v_readlane_b32 s31, v23, 1
-; GFX10_3-NEXT:    v_readlane_b32 s30, v23, 0
+; GFX10_3-NEXT:    v_readlane_b32 s30, v23, 15
+; GFX10_3-NEXT:    v_readlane_b32 s31, v23, 16
+; GFX10_3-NEXT:    v_readlane_b32 s55, v23, 14
+; GFX10_3-NEXT:    v_readlane_b32 s54, v23, 13
+; GFX10_3-NEXT:    v_readlane_b32 s53, v23, 12
+; GFX10_3-NEXT:    v_readlane_b32 s52, v23, 11
+; GFX10_3-NEXT:    v_readlane_b32 s51, v23, 10
+; GFX10_3-NEXT:    v_readlane_b32 s50, v23, 9
+; GFX10_3-NEXT:    v_readlane_b32 s49, v23, 8
+; GFX10_3-NEXT:    v_readlane_b32 s48, v23, 7
+; GFX10_3-NEXT:    v_readlane_b32 s39, v23, 6
+; GFX10_3-NEXT:    v_readlane_b32 s38, v23, 5
+; GFX10_3-NEXT:    v_readlane_b32 s37, v23, 4
+; GFX10_3-NEXT:    v_readlane_b32 s36, v23, 3
+; GFX10_3-NEXT:    v_readlane_b32 s35, v23, 2
+; GFX10_3-NEXT:    v_readlane_b32 s34, v23, 1
+; GFX10_3-NEXT:    v_readlane_b32 s33, v23, 0
 ; GFX10_3-NEXT:    s_xor_saveexec_b32 s4, -1
 ; GFX10_3-NEXT:    s_add_i32 s5, s32, 0x80880
 ; GFX10_3-NEXT:    buffer_load_dword v23, off, s[0:3], s5 ; 4-byte Folded Reload
@@ -434,30 +434,30 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
 ; GFX11-NEXT:    s_add_i32 s1, s32, 0x4044
 ; GFX11-NEXT:    scratch_store_b32 off, v23, s1 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
-; GFX11-NEXT:    v_writelane_b32 v23, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v23, s33, 0
 ; GFX11-NEXT:    s_add_i32 s0, s32, 64
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-NEXT:    s_and_b32 s0, 0, exec_lo
-; GFX11-NEXT:    v_writelane_b32 v23, s31, 1
+; GFX11-NEXT:    v_writelane_b32 v23, s34, 1
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ; use alloca0 v0
 ; GFX11-NEXT:    ;;#ASMEND
-; GFX11-NEXT:    v_writelane_b32 v23, s33, 2
-; GFX11-NEXT:    v_writelane_b32 v23, s34, 3
-; GFX11-NEXT:    v_writelane_b32 v23, s35, 4
-; GFX11-NEXT:    v_writelane_b32 v23, s36, 5
-; GFX11-NEXT:    v_writelane_b32 v23, s37, 6
-; GFX11-NEXT:    v_writelane_b32 v23, s38, 7
-; GFX11-NEXT:    v_writelane_b32 v23, s39, 8
-; GFX11-NEXT:    v_writelane_b32 v23, s48, 9
-; GFX11-NEXT:    v_writelane_b32 v23, s49, 10
-; GFX11-NEXT:    v_writelane_b32 v23, s50, 11
-; GFX11-NEXT:    v_writelane_b32 v23, s51, 12
-; GFX11-NEXT:    v_writelane_b32 v23, s52, 13
-; GFX11-NEXT:    v_writelane_b32 v23, s53, 14
-; GFX11-NEXT:    v_writelane_b32 v23, s54, 15
-; GFX11-NEXT:    v_writelane_b32 v23, s55, 16
+; GFX11-NEXT:    v_writelane_b32 v23, s35, 2
+; GFX11-NEXT:    v_writelane_b32 v23, s36, 3
+; GFX11-NEXT:    v_writelane_b32 v23, s37, 4
+; GFX11-NEXT:    v_writelane_b32 v23, s38, 5
+; GFX11-NEXT:    v_writelane_b32 v23, s39, 6
+; GFX11-NEXT:    v_writelane_b32 v23, s48, 7
+; GFX11-NEXT:    v_writelane_b32 v23, s49, 8
+; GFX11-NEXT:    v_writelane_b32 v23, s50, 9
+; GFX11-NEXT:    v_writelane_b32 v23, s51, 10
+; GFX11-NEXT:    v_writelane_b32 v23, s52, 11
+; GFX11-NEXT:    v_writelane_b32 v23, s53, 12
+; GFX11-NEXT:    v_writelane_b32 v23, s54, 13
+; GFX11-NEXT:    v_writelane_b32 v23, s55, 14
+; GFX11-NEXT:    v_writelane_b32 v23, s30, 15
+; GFX11-NEXT:    v_writelane_b32 v23, s31, 16
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc
 ; GFX11-NEXT:    ;;#ASMEND
@@ -470,23 +470,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s54, scc
 ; GFX11-NEXT:    ;;#ASMEND
-; GFX11-NEXT:    v_readlane_b32 s55, v23, 16
-; GFX11-NEXT:    v_readlane_b32 s54, v23, 15
-; GFX11-NEXT:    v_readlane_b32 s53, v23, 14
-; GFX11-NEXT:    v_readlane_b32 s52, v23, 13
-; GFX11-NEXT:    v_readlane_b32 s51, v23, 12
-; GFX11-NEXT:    v_readlane_b32 s50, v23, 11
-; GFX11-NEXT:    v_readlane_b32 s49, v23, 10
-; GFX11-NEXT:    v_readlane_b32 s48, v23, 9
-; GFX11-NEXT:    v_readlane_b32 s39, v23, 8
-; GFX11-NEXT:    v_readlane_b32 s38, v23, 7
-; GFX11-NEXT:    v_readlane_b32 s37, v23, 6
-; GFX11-NEXT:    v_readlane_b32 s36, v23, 5
-; GFX11-NEXT:    v_readlane_b32 s35, v23, 4
-; GFX11-NEXT:    v_readlane_b32 s34, v23, 3
-; GFX11-NEXT:    v_readlane_b32 s33, v23, 2
-; GFX11-NEXT:    v_readlane_b32 s31, v23, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v23, 0
+; GFX11-NEXT:    v_readlane_b32 s30, v23, 15
+; GFX11-NEXT:    v_readlane_b32 s31, v23, 16
+; GFX11-NEXT:    v_readlane_b32 s55, v23, 14
+; GFX11-NEXT:    v_readlane_b32 s54, v23, 13
+; GFX11-NEXT:    v_readlane_b32 s53, v23, 12
+; GFX11-NEXT:    v_readlane_b32 s52, v23, 11
+; GFX11-NEXT:    v_readlane_b32 s51, v23, 10
+; GFX11-NEXT:    v_readlane_b32 s50, v23, 9
+; GFX11-NEXT:    v_readlane_b32 s49, v23, 8
+; GFX11-NEXT:    v_readlane_b32 s48, v23, 7
+; GFX11-NEXT:    v_readlane_b32 s39, v23, 6
+; GFX11-NEXT:    v_readlane_b32 s38, v23, 5
+; GFX11-NEXT:    v_readlane_b32 s37, v23, 4
+; GFX11-NEXT:    v_readlane_b32 s36, v23, 3
+; GFX11-NEXT:    v_readlane_b32 s35, v23, 2
+; GFX11-NEXT:    v_readlane_b32 s34, v23, 1
+; GFX11-NEXT:    v_readlane_b32 s33, v23, 0
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX11-NEXT:    s_add_i32 s1, s32, 0x4044
 ; GFX11-NEXT:    scratch_load_b32 v23, off, s1 ; 4-byte Folded Reload
@@ -505,28 +505,28 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
 ; GFX12-NEXT:    scratch_store_b32 off, v23, s32 offset:16388 ; 4-byte Folded Spill
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_mov_b32 exec_lo, s0
-; GFX12-NEXT:    v_writelane_b32 v23, s30, 0
+; GFX12-NEXT:    v_writelane_b32 v23, s33, 0
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s32
 ; GFX12-NEXT:    s_and_b32 s0, 0, exec_lo
 ; GFX12-NEXT:    ;;#ASMSTART
 ; GFX12-NEXT:    ; use alloca0 v0
 ; GFX12-NEXT:    ;;#ASMEND
-; GFX12-NEXT:    v_writelane_b32 v23, s31, 1
-; GFX12-NEXT:    v_writelane_b32 v23, s33, 2
-; GFX12-NEXT:    v_writelane_b32 v23, s34, 3
-; GFX12-NEXT:    v_writelane_b32 v23, s35, 4
-; GFX12-NEXT:    v_writelane_b32 v23, s36, 5
-; GFX12-NEXT:    v_writelane_b32 v23, s37, 6
-; GFX12-NEXT:    v_writelane_b32 v23, s38, 7
-; GFX12-NEXT:    v_writelane_b32 v23, s39, 8
-; GFX12-NEXT:    v_writelane_b32 v23, s48, 9
-; GFX12-NEXT:    v_writelane_b32 v23, s49, 10
-; GFX12-NEXT:    v_writelane_b32 v23, s50, 11
-; GFX12-NEXT:    v_writelane_b32 v23, s51, 12
-; GFX12-NEXT:    v_writelane_b32 v23, s52, 13
-; GFX12-NEXT:    v_writelane_b32 v23, s53, 14
-; GFX12-NEXT:    v_writelane_b32 v23, s54, 15
-; GFX12-NEXT:    v_writelane_b32 v23, s55, 16
+; GFX12-NEXT:    v_writelane_b32 v23, s34, 1
+; GFX12-NEXT:    v_writelane_b32 v23, s35, 2
+; GFX12-NEXT:    v_writelane_b32 v23, s36, 3
+; GFX12-NEXT:    v_writelane_b32 v23, s37, 4
+; GFX12-NEXT:    v_writelane_b32 v23, s38, 5
+; GFX12-NEXT:    v_writelane_b32 v23, s39, 6
+; GFX12-NEXT:    v_writelane_b32 v23, s48, 7
+; GFX12-NEXT:    v_writelane_b32 v23, s49, 8
+; GFX12-NEXT:    v_writelane_b32 v23, s50, 9
+; GFX12-NEXT:    v_writelane_b32 v23, s51, 10
+; GFX12-NEXT:    v_writelane_b32 v23, s52, 11
+; GFX12-NEXT:    v_writelane_b32 v23, s53, 12
+; GFX12-NEXT:    v_writelane_b32 v23, s54, 13
+; GFX12-NEXT:    v_writelane_b32 v23, s55, 14
+; GFX12-NEXT:    v_writelane_b32 v23, s30, 15
+; GFX12-NEXT:    v_writelane_b32 v23, s31, 16
 ; GFX12-NEXT:    ;;#ASMSTART
 ; GFX12-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc
 ; GFX12-NEXT:    ;;#ASMEND
@@ -540,23 +540,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
 ; GFX12-NEXT:    ;;#ASMSTART
 ; GFX12-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s54, scc
 ; GFX12-NEXT:    ;;#ASMEND
-; GFX12-NEXT:    v_readlane_b32 s55, v23, 16
-; GFX12-NEXT:    v_readlane_b32 s54, v23, 15
-; GFX12-NEXT:    v_readlane_b32 s53, v23, 14
-; GFX12-NEXT:    v_readlane_b32 s52, v23, 13
-; GFX12-NEXT:    v_readlane_b32 s51, v23, 12
-; GFX12-NEXT:    v_readlane_b32 s50, v23, 11
-; GFX12-NEXT:    v_readlane_b32 s49, v23, 10
-; GFX12-NEXT:    v_readlane_b32 s48, v23, 9
-; GFX12-NEXT:    v_readlane_b32 s39, v23, 8
-; GFX12-NEXT:    v_readlane_b32 s38, v23, 7
-; GFX12-NEXT:    v_readlane_b32 s37, v23, 6
-; GFX12-NEXT:    v_readlane_b32 s36, v23, 5
-; GFX12-NEXT:    v_readlane_b32 s35, v23, 4
-; GFX12-NEXT:    v_readlane_b32 s34, v23, 3
-; GFX12-NEXT:    v_readlane_b32 s33, v23, 2
-; GFX12-NEXT:    v_readlane_b32 s31, v23, 1
-; GFX12-NEXT:    v_readlane_b32 s30, v23, 0
+; GFX12-NEXT:    v_readlane_b32 s30, v23, 15
+; GFX12-NEXT:    v_readlane_b32 s31, v23, 16
+; GFX12-NEXT:    v_readlane_b32 s55, v23, 14
+; GFX12-NEXT:    v_readlane_b32 s54, v23, 13
+; GFX12-NEXT:    v_readlane_b32 s53, v23, 12
+; GFX12-NEXT:    v_readlane_b32 s52, v23, 11
+; GFX12-NEXT:    v_readlane_b32 s51, v23, 10
+; GFX12-NEXT:    v_readlane_b32 s50, v23, 9
+; GFX12-NEXT:    v_readlane_b32 s49, v23, 8
+; GFX12-NEXT:    v_readlane_b32 s48, v23, 7
+; GFX12-NEXT:    v_readlane_b32 s39, v23, 6
+; GFX12-NEXT:    v_readlane_b32 s38, v23, 5
+; GFX12-NEXT:    v_readlane_b32 s37, v23, 4
+; GFX12-NEXT:    v_readlane_b32 s36, v23, 3
+; GFX12-NEXT:    v_readlane_b32 s35, v23, 2
+; GFX12-NEXT:    v_readlane_b32 s34, v23, 1
+; GFX12-NEXT:    v_readlane_b32 s33, v23, 0
 ; GFX12-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX12-NEXT:    scratch_load_b32 v23, off, s32 offset:16388 ; 4-byte Folded Reload
 ; GFX12-NEXT:    s_wait_alu 0xfffe
@@ -613,24 +613,24 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
 ; GFX7-NEXT:    s_add_i32 s6, s32, 0x100400
 ; GFX7-NEXT:    buffer_store_dword v21, off, s[0:3], s6 ; 4-byte Folded Spill
 ; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX7-NEXT:    v_writelane_b32 v21, s30, 0
-; GFX7-NEXT:    v_writelane_b32 v21, s31, 1
-; GFX7-NEXT:    v_writelane_b32 v21, s33, 2
-; GFX7-NEXT:    v_writelane_b32 v21, s34, 3
-; GFX7-NEXT:    v_writelane_b32 v21, s35, 4
-; GFX7-NEXT:    v_writelane_b32 v21, s36, 5
-; GFX7-NEXT:    v_writelane_b32 v21, s37, 6
-; GFX7-NEXT:    v_writelane_b32 v21, s38, 7
-; GFX7-NEXT:    v_writelane_b32 v21, s39, 8
-; GFX7-NEXT:    v_writelane_b32 v21, s48, 9
-; GFX7-NEXT:    v_writelane_b32 v21, s49, 10
-; GFX7-NEXT:    v_writelane_b32 v21, s50, 11
-; GFX7-NEXT:    v_writelane_b32 v21, s51, 12
-; GFX7-NEXT:    v_writelane_b32 v21, s52, 13
-; GFX7-NEXT:    v_writelane_b32 v21, s53, 14
-; GFX7-NEXT:    v_writelane_b32 v21, s54, 15
+; GFX7-NEXT:    v_writelane_b32 v21, s33, 0
+; GFX7-NEXT:    v_writelane_b32 v21, s34, 1
+; GFX7-NEXT:    v_writelane_b32 v21, s35, 2
+; GFX7-NEXT:    v_writelane_b32 v21, s36, 3
+; GFX7-NEXT:    v_writelane_b32 v21, s37, 4
+; GFX7-NEXT:    v_writelane_b32 v21, s38, 5
+; GFX7-NEXT:    v_writelane_b32 v21, s39, 6
+; GFX7-NEXT:    v_writelane_b32 v21, s48, 7
+; GFX7-NEXT:    v_writelane_b32 v21, s49, 8
+; GFX7-NEXT:    v_writelane_b32 v21, s50, 9
+; GFX7-NEXT:    v_writelane_b32 v21, s51, 10
+; GFX7-NEXT:    v_writelane_b32 v21, s52, 11
+; GFX7-NEXT:    v_writelane_b32 v21, s53, 12
+; GFX7-NEXT:    v_writelane_b32 v21, s54, 13
+; GFX7-NEXT:    v_writelane_b32 v21, s55, 14
+; GFX7-NEXT:    v_writelane_b32 v21, s30, 15
 ; GFX7-NEXT:    s_and_b64 s[4:5], 0, exec
-; GFX7-NEXT:    v_writelane_b32 v21, s55, 16
+; GFX7-NEXT:    v_writelane_b32 v21, s31, 16
 ; GFX7-NEXT:    ;;#ASMSTART
 ; GFX7-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
 ; GFX7-NEXT:    ;;#ASMEND
@@ -640,23 +640,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
 ; GFX7-NEXT:    ;;#ASMSTART
 ; GFX7-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s54, scc
 ; GFX7-NEXT:    ;;#ASMEND
-; GFX7-NEXT:    v_readlane_b32 s55, v21, 16
-; GFX7-NEXT:    v_readlane_b32 s54, v21, 15
-; GFX7-NEXT:    v_readlane_b32 s53, v21, 14
-; GFX7-NEXT:    v_readlane_b32 s52, v21, 13
-; GFX7-NEXT:    v_readlane_b32 s51, v21, 12
-; GFX7-NEXT:    v_readlane_b32 s50, v21, 11
-; GFX7-NEXT:    v_readlane_b32 s49, v21, 10
-; GFX7-NEXT:    v_readlane_b32 s48, v21, 9
-; GFX7-NEXT:    v_readlane_b32 s39, v21, 8
-; GFX7-NEXT:    v_readlane_b32 s38, v21, 7
-; GFX7-NEXT:    v_readlane_b32 s37, v21, 6
-; GFX7-NEXT:    v_readlane_b32 s36, v21, 5
-; GFX7-NEXT:    v_readlane_b32 s35, v21, 4
-; GFX7-NEXT:    v_readlane_b32 s34, v21, 3
-; GFX7-NEXT:    v_readlane_b32 s33, v21, 2
-; GFX7-NEXT:    v_readlane_b32 s31, v21, 1
-; GFX7-NEXT:    v_readlane_b32 s30, v21, 0
+; GFX7-NEXT:    v_readlane_b32 s30, v21, 15
+; GFX7-NEXT:    v_readlane_b32 s31, v21, 16
+; GFX7-NEXT:    v_readlane_b32 s55, v21, 14
+; GFX7-NEXT:    v_readlane_b32 s54, v21, 13
+; GFX7-NEXT:    v_readlane_b32 s53, v21, 12
+; GFX7-NEXT:    v_readlane_b32 s52, v21, 11
+; GFX7-NEXT:    v_readlane_b32 s51, v21, 10
+; GFX7-NEXT:    v_readlane_b32 s50, v21, 9
+; GFX7-NEXT:    v_readlane_b32 s49, v21, 8
+; GFX7-NEXT:    v_readlane_b32 s48, v21, 7
+; GFX7-NEXT:    v_readlane_b32 s39, v21, 6
+; GFX7-NEXT:    v_readlane_b32 s38, v21, 5
+; GFX7-NEXT:    v_readlane_b32 s37, v21, 4
+; GFX7-NEXT:    v_readlane_b32 s36, v21, 3
+; GFX7-NEXT:    v_readlane_b32 s35, v21, 2
+; GFX7-NEXT:    v_readlane_b32 s34, v21, 1
+; GFX7-NEXT:    v_readlane_b32 s33, v21, 0
 ; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX7-NEXT:    s_add_i32 s6, s32, 0x100400
 ; GFX7-NEXT:    buffer_load_dword v21, off, s[0:3], s6 ; 4-byte Folded Reload
@@ -671,24 +671,24 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
 ; GFX8-NEXT:    s_add_i32 s6, s32, 0x100400
 ; GFX8-NEXT:    buffer_store_dword v21, off, s[0:3], s6 ; 4-byte Folded Spill
 ; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX8-NEXT:    v_writelane_b32 v21, s30, 0
-; GFX8-NEXT:    v_writelane_b32 v21, s31, 1
-; GFX8-NEXT:    v_writelane_b32 v21, s33, 2
-; GFX8-NEXT:    v_writelane_b32 v21, s34, 3
-; GFX8-NEXT:    v_writelane_b32 v21, s35, 4
-; GFX8-NEXT:    v_writelane_b32 v21, s36, 5
-; GFX8-NEXT:    v_writelane_b32 v21, s37, 6
-; GFX8-NEXT:    v_writelane_b32 v21, s38, 7
-; GFX8-NEXT:    v_writelane_b32 v21, s39, 8
-; GFX8-NEXT:    v_writelane_b32 v21, s48, 9
-; GFX8-NEXT:    v_writelane_b32 v21, s49, 10
-; GFX8-NEXT:    v_writelane_b32 v21, s50, 11
-; GFX8-NEXT:    v_writelane_b32 v21, s51, 12
-; GFX8-NEXT:    v_writelane_b32 v21, s52, 13
-; GFX8-NEXT:    v_writelane_b32 v21, s53, 14
-; GFX8-NEXT:    v_writelane_b32 v21, s54, 15
+; GFX8-NEXT:    v_writelane_b32 v21, s33, 0
+; GFX8-NEXT:    v_writelane_b32 v21, s34, 1
+; GFX8-NEXT:    v_writelane_b32 v21, s35, 2
+; GFX8-NEXT:    v_writelane_b32 v21, s36, 3
+; GFX8-NEXT:    v_writelane_b32 v21, s37, 4
+; GFX8-NEXT:    v_writelane_b32 v21, s38, 5
+; GFX8-NEXT:    v_writelane_b32 v21, s39, 6
+; GFX8-NEXT:    v_writelane_b32 v21, s48, 7
+; GFX8-NEXT:    v_writelane_b32 v21, s49, 8
+; GFX8-NEXT:    v_writelane_b32 v21, s50, 9
+; GFX8-NEXT:    v_writelane_b32 v21, s51, 10
+; GFX8-NEXT:    v_writelane_b32 v21, s52, 11
+; GFX8-NEXT:    v_writelane_b32 v21, s53, 12
+; GFX8-NEXT:    v_writelane_b32 v21, s54, 13
+; GFX8-NEXT:    v_writelane_b32 v21, s55, 14
+; GFX8-NEXT:    v_writelane_b32 v21, s30, 15
 ; GFX8-NEXT:    s_and_b64 s[4:5], 0, exec
-; GFX8-NEXT:    v_writelane_b32 v21, s55, 16
+; GFX8-NEXT:    v_writelane_b32 v21, s31, 16
 ; GFX8-NEXT:    ;;#ASMSTART
 ; GFX8-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
 ; GFX8-NEXT:    ;;#ASMEND
@@ -699,23 +699,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
 ; GFX8-NEXT:    ;;#ASMSTART
 ; GFX8-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s54, scc
 ; GFX8-NEXT:    ;;#ASMEND
-; GFX8-NEXT:    v_readlane_b32 s55, v21, 16
-; GFX8-NEXT:    v_readlane_b32 s54, v21, 15
-; GFX8-NEXT:    v_readlane_b32 s53, v21, 14
-; GFX8-NEXT:    v_readlane_b32 s52, v21, 13
-; GFX8-NEXT:    v_readlane_b32 s51, v21, 12
-; GFX8-NEXT:    v_readlane_b32 s50, v21, 11
-; GFX8-NEXT:    v_readlane_b32 s49, v21, 10
-; GFX8-NEXT:    v_readlane_b32 s48, v21, 9
-; GFX8-NEXT:    v_readlane_b32 s39, v21, 8
-; GFX8-NEXT:    v_readlane_b32 s38, v21, 7
-; GFX8-NEXT:    v_readlane_b32 s37, v21, 6
-; GFX8-NEXT:    v_readlane_b32 s36, v21, 5
-; GFX8-NEXT:    v_readlane_b32 s35, v21, 4
-; GFX8-NEXT:    v_readlane_b32 s34, v21, 3
-; GFX8-NEXT:    v_readlane_b32 s33, v21, 2
-; GFX8-NEXT:    v_readlane_b32 s31, v21, 1
-; GFX8-NEXT:    v_readlane_b32 s30, v21, 0
+; GFX8-NEXT:    v_readlane_b32 s30, v21, 15
+; GFX8-NEXT:    v_readlane_b32 s31, v21, 16
+; GFX8-NEXT:    v_readlane_b32 s55, v21, 14
+; GFX8-NEXT:    v_readlane_b32 s54, v21, 13
+; GFX8-NEXT:    v_readlane_b32 s53, v21, 12
+; GFX8-NEXT:    v_readlane_b32 s52, v21, 11
+; GFX8-NEXT:    v_readlane_b32 s51, v21, 10
+; GFX8-NEXT:    v_readlane_b32 s50, v21, 9
+; GFX8-NEXT:    v_readlane_b32 s49, v21, 8
+; GFX8-NEXT:    v_readlane_b32 s48, v21, 7
+; GFX8-NEXT:    v_readlane_b32 s39, v21, 6
+; GFX8-NEXT:    v_readlane_b32 s38, v21, 5
+; GFX8-NEXT:    v_readlane_b32 s37, v21, 4
+; GFX8-NEXT:    v_readlane_b32 s36, v21, 3
+; GFX8-NEXT:    v_readlane_b32 s35, v21, 2
+; GFX8-NEXT:    v_readlane_b32 s34, v21, 1
+; GFX8-NEXT:    v_readlane_b32 s33, v21, 0
 ; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX8-NEXT:    s_add_i32 s6, s32, 0x100400
 ; GFX8-NEXT:    buffer_load_dword v21, off, s[0:3], s6 ; 4-byte Folded Reload
@@ -730,24 +730,24 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
 ; GFX900-NEXT:    s_add_i32 s6, s32, 0x100400
 ; GFX900-NEXT:    buffer_store_dword v21, off, s[0:3], s6 ; 4-byte Folded Spill
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX900-NEXT:    v_writelane_b32 v21, s30, 0
-; GFX900-NEXT:    v_writelane_b32 v21, s31, 1
-; GFX900-NEXT:    v_writelane_b32 v21, s33, 2
-; GFX900-NEXT:    v_writelane_b32 v21, s34, 3
-; GFX900-NEXT:    v_writelane_b32 v21, s35, 4
-; GFX900-NEXT:    v_writelane_b32 v21, s36, 5
-; GFX900-NEXT:    v_writelane_b32 v21, s37, 6
-; GFX900-NEXT:    v_writelane_b32 v21, s38, 7
-; GFX900-NEXT:    v_writelane_b32 v21, s39, 8
-; GFX900-NEXT:    v_writelane_b32 v21, s48, 9
-; GFX900-NEXT:    v_writelane_b32 v21, s49, 10
-; GFX900-NEXT:    v_writelane_b32 v21, s50, 11
-; GFX900-NEXT:    v_writelane_b32 v21, s51, 12
-; GFX900-NEXT:    v_writelane_b32 v21, s52, 13
-; GFX900-NEXT:    v_writelane_b32 v21, s53, 14
-; GFX900-NEXT:    v_writelane_b32 v21, s54, 15
+; GFX900-NEXT:    v_writelane_b32 v21, s33, 0
+; GFX900-NEXT:    v_writelane_b32 v21, s34, 1
+; GFX900-NEXT:    v_writelane_b32 v21, s35, 2
+; GFX900-NEXT:    v_writelane_b32 v21, s36, 3
+; GFX900-NEXT:    v_writelane_b32 v21, s37, 4
+; GFX900-NEXT:    v_writelane_b32 v21, s38, 5
+; GFX900-NEXT:    v_writelane_b32 v21, s39, 6
+; GFX900-NEXT:    v_writelane_b32 v21, s48, 7
+; GFX900-NEXT:    v_writelane_b32 v21, s49, 8
+; GFX900-NEXT:    v_writelane_b32 v21, s50, 9
+; GFX900-NEXT:    v_writelane_b32 v21, s51, 10
+; GFX900-NEXT:    v_writelane_b32 v21, s52, 11
+; GFX900-NEXT:    v_writelane_b32 v21, s53, 12
+; GFX900-NEXT:    v_writelane_b32 v21, s54, 13
+; GFX900-NEXT:    v_writelane_b32 v21, s55, 14
+; GFX900-NEXT:    v_writelane_b32 v21, s30, 15
 ; GFX900-NEXT:    s_and_b64 s[4:5], 0, exec
-; GFX900-NEXT:    v_writelane_b32 v21, s55, 16
+; GFX900-NEXT:    v_writelane_b32 v21, s31, 16
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
 ; GFX900-NEXT:    ;;#ASMEND
@@ -758,23 +758,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s54, scc
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    v_readlane_b32 s55, v21, 16
-; GFX900-NEXT:    v_readlane_b32 s54, v21, 15
-; GFX900-NEXT:    v_readlane_b32 s53, v21, 14
-; GFX900-NEXT:    v_readlane_b32 s52, v21, 13
-; GFX900-NEXT:    v_readlane_b32 s51, v21, 12
-; GFX900-NEXT:    v_readlane_b32 s50, v21, 11
-; GFX900-NEXT:    v_readlane_b32 s49, v21, 10
-; GFX900-NEXT:    v_readlane_b32 s48, v21, 9
-; GFX900-NEXT:    v_readlane_b32 s39, v21, 8
-; GFX900-NEXT:    v_readlane_b32 s38, v21, 7
-; GFX900-NEXT:    v_readlane_b32 s37, v21, 6
-; GFX900-NEXT:    v_readlane_b32 s36, v21, 5
-; GFX900-NEXT:    v_readlane_b32 s35, v21, 4
-; GFX900-NEXT:    v_readlane_b32 s34, v21, 3
-; GFX900-NEXT:    v_readlane_b32 s33, v21, 2
-; GFX900-NEXT:    v_readlane_b32 s31, v21, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v21, 0
+; GFX900-NEXT:    v_readlane_b32 s30, v21, 15
+; GFX900-NEXT:    v_readlane_b32 s31, v21, 16
+; GFX900-NEXT:    v_readlane_b32 s55, v21, 14
+; GFX900-NEXT:    v_readlane_b32 s54, v21, 13
+; GFX900-NEXT:    v_readlane_b32 s53, v21, 12
+; GFX900-NEXT:    v_readlane_b32 s52, v21, 11
+; GFX900-NEXT:    v_readlane_b32 s51, v21, 10
+; GFX900-NEXT:    v_readlane_b32 s50, v21, 9
+; GFX900-NEXT:    v_readlane_b32 s49, v21, 8
+; GFX900-NEXT:    v_readlane_b32 s48, v21, 7
+; GFX900-NEXT:    v_readlane_b32 s39, v21, 6
+; GFX900-NEXT:    v_readlane_b32 s38, v21, 5
+; GFX900-NEXT:    v_readlane_b32 s37, v21, 4
+; GFX900-NEXT:    v_readlane_b32 s36, v21, 3
+; GFX900-NEXT:    v_readlane_b32 s35, v21, 2
+; GFX900-NEXT:    v_readlane_b32 s34, v21, 1
+; GFX900-NEXT:    v_readlane_b32 s33, v21, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    s_add_i32 s6, s32, 0x100400
 ; GFX900-NEXT:    buffer_load_dword v21, off, s[0:3], s6 ; 4-byte Folded Reload
@@ -789,24 +789,25 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
 ; GFX942-NEXT:    s_add_i32 s2, s32, 0x4010
 ; GFX942-NEXT:    scratch_store_dword off, v21, s2 ; 4-byte Folded Spill
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX942-NEXT:    v_writelane_b32 v21, s30, 0
-; GFX942-NEXT:    v_writelane_b32 v21, s31, 1
-; GFX942-NEXT:    v_writelane_b32 v21, s33, 2
-; GFX942-NEXT:    v_writelane_b32 v21, s34, 3
-; GFX942-NEXT:    v_writelane_b32 v21, s35, 4
-; GFX942-NEXT:    v_writelane_b32 v21, s36, 5
-; GFX942-NEXT:    v_writelane_b32 v21, s37, 6
-; GFX942-NEXT:    v_writelane_b32 v21, s38, 7
-; GFX942-NEXT:    v_writelane_b32 v21, s39, 8
-; GFX942-NEXT:    v_writelane_b32 v21, s48, 9
-; GFX942-NEXT:    v_writelane_b32 v21, s49, 10
-; GFX942-NEXT:    v_writelane_b32 v21, s50, 11
-; GFX942-NEXT:    v_writelane_b32 v21, s51, 12
-; GFX942-NEXT:    v_writelane_b32 v21, s52, 13
-; GFX942-NEXT:    v_writelane_b32 v21, s53, 14
-; GFX942-NEXT:    v_writelane_b32 v21, s54, 15
+; GFX942-NEXT:    v_writelane_b32 v21, s33, 0
+; GFX942-NEXT:    v_writelane_b32 v21, s34, 1
+; GFX942-NEXT:    v_writelane_b32 v21, s35, 2
+; GFX942-NEXT:    v_writelane_b32 v21, s36, 3
+; GFX942-NEXT:    v_writelane_b32 v21, s37, 4
+; GFX942-NEXT:    v_writelane_b32 v21, s38, 5
+; GFX942-NEXT:    v_writelane_b32 v21, s39, 6
+; GFX942-NEXT:    v_writelane_b32 v21, s48, 7
+; GFX942-NEXT:    v_writelane_b32 v21, s49, 8
+; GFX942-NEXT:    v_writelane_b32 v21, s50, 9
+; GFX942-NEXT:    v_writelane_b32 v21, s51, 10
+; GFX942-NEXT:    v_writelane_b32 v21, s52, 11
+; GFX942-NEXT:    v_writelane_b32 v21, s53, 12
+; GFX942-NEXT:    v_writelane_b32 v21, s54, 13
+; GFX942-NEXT:    v_writelane_b32 v21, s55, 14
+; GFX942-NEXT:    v_writelane_b32 v21, s30, 15
 ; GFX942-NEXT:    s_and_b64 s[60:61], 0, exec
-; GFX942-NEXT:    v_writelane_b32 v21, s55, 16
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_writelane_b32 v21, s31, 16
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
 ; GFX942-NEXT:    ;;#ASMEND
@@ -818,23 +819,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s54, scc
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_readlane_b32 s55, v21, 16
-; GFX942-NEXT:    v_readlane_b32 s54, v21, 15
-; GFX942-NEXT:    v_readlane_b32 s53, v21, 14
-; GFX942-NEXT:    v_readlane_b32 s52, v21, 13
-; GFX942-NEXT:    v_readlane_b32 s51, v21, 12
-; GFX942-NEXT:    v_readlane_b32 s50, v21, 11
-; GFX942-NEXT:    v_readlane_b32 s49, v21, 10
-; GFX942-NEXT:    v_readlane_b32 s48, v21, 9
-; GFX942-NEXT:    v_readlane_b32 s39, v21, 8
-; GFX942-NEXT:    v_readlane_b32 s38, v21, 7
-; GFX942-NEXT:    v_readlane_b32 s37, v21, 6
-; GFX942-NEXT:    v_readlane_b32 s36, v21, 5
-; GFX942-NEXT:    v_readlane_b32 s35, v21, 4
-; GFX942-NEXT:    v_readlane_b32 s34, v21, 3
-; GFX942-NEXT:    v_readlane_b32 s33, v21, 2
-; GFX942-NEXT:    v_readlane_b32 s31, v21, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v21, 0
+; GFX942-NEXT:    v_readlane_b32 s30, v21, 15
+; GFX942-NEXT:    v_readlane_b32 s31, v21, 16
+; GFX942-NEXT:    v_readlane_b32 s55, v21, 14
+; GFX942-NEXT:    v_readlane_b32 s54, v21, 13
+; GFX942-NEXT:    v_readlane_b32 s53, v21, 12
+; GFX942-NEXT:    v_readlane_b32 s52, v21, 11
+; GFX942-NEXT:    v_readlane_b32 s51, v21, 10
+; GFX942-NEXT:    v_readlane_b32 s50, v21, 9
+; GFX942-NEXT:    v_readlane_b32 s49, v21, 8
+; GFX942-NEXT:    v_readlane_b32 s48, v21, 7
+; GFX942-NEXT:    v_readlane_b32 s39, v21, 6
+; GFX942-NEXT:    v_readlane_b32 s38, v21, 5
+; GFX942-NEXT:    v_readlane_b32 s37, v21, 4
+; GFX942-NEXT:    v_readlane_b32 s36, v21, 3
+; GFX942-NEXT:    v_readlane_b32 s35, v21, 2
+; GFX942-NEXT:    v_readlane_b32 s34, v21, 1
+; GFX942-NEXT:    v_readlane_b32 s33, v21, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    s_add_i32 s2, s32, 0x4010
 ; GFX942-NEXT:    scratch_load_dword v21, off, s2 ; 4-byte Folded Reload
@@ -850,24 +851,24 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
 ; GFX10_1-NEXT:    buffer_store_dword v21, off, s[0:3], s5 ; 4-byte Folded Spill
 ; GFX10_1-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10_1-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10_1-NEXT:    v_writelane_b32 v21, s30, 0
+; GFX10_1-NEXT:    v_writelane_b32 v21, s33, 0
 ; GFX10_1-NEXT:    s_and_b32 s59, 0, exec_lo
-; GFX10_1-NEXT:    v_writelane_b32 v21, s31, 1
-; GFX10_1-NEXT:    v_writelane_b32 v21, s33, 2
-; GFX10_1-NEXT:    v_writelane_b32 v21, s34, 3
-; GFX10_1-NEXT:    v_writelane_b32 v21, s35, 4
-; GFX10_1-NEXT:    v_writelane_b32 v21, s36, 5
-; GFX10_1-NEXT:    v_writelane_b32 v21, s37, 6
-; GFX10_1-NEXT:    v_writelane_b32 v21, s38, 7
-; GFX10_1-NEXT:    v_writelane_b32 v21, s39, 8
-; GFX10_1-NEXT:    v_writelane_b32 v21, s48, 9
-; GFX10_1-NEXT:    v_writelane_b32 v21, s49, 10
-; GFX10_1-NEXT:    v_writelane_b32 v21, s50, 11
-; GFX10_1-NEXT:    v_writelane_b32 v21, s51, 12
-; GFX10_1-NEXT:    v_writelane_b32 v21, s52, 13
-; GFX10_1-NEXT:    v_writelane_b32 v21, s53, 14
-; GFX10_1-NEXT:    v_writelane_b32 v21, s54, 15
-; GFX10_1-NEXT:    v_writelane_b32 v21, s55, 16
+; GFX10_1-NEXT:    v_writelane_b32 v21, s34, 1
+; GFX10_1-NEXT:    v_writelane_b32 v21, s35, 2
+; GFX10_1-NEXT:    v_writelane_b32 v21, s36, 3
+; GFX10_1-NEXT:    v_writelane_b32 v21, s37, 4
+; GFX10_1-NEXT:    v_writelane_b32 v21, s38, 5
+; GFX10_1-NEXT:    v_writelane_b32 v21, s39, 6
+; GFX10_1-NEXT:    v_writelane_b32 v21, s48, 7
+; GFX10_1-NEXT:    v_writelane_b32 v21, s49, 8
+; GFX10_1-NEXT:    v_writelane_b32 v21, s50, 9
+; GFX10_1-NEXT:    v_writelane_b32 v21, s51, 10
+; GFX10_1-NEXT:    v_writelane_b32 v21, s52, 11
+; GFX10_1-NEXT:    v_writelane_b32 v21, s53, 12
+; GFX10_1-NEXT:    v_writelane_b32 v21, s54, 13
+; GFX10_1-NEXT:    v_writelane_b32 v21, s55, 14
+; GFX10_1-NEXT:    v_writelane_b32 v21, s30, 15
+; GFX10_1-NEXT:    v_writelane_b32 v21, s31, 16
 ; GFX10_1-NEXT:    ;;#ASMSTART
 ; GFX10_1-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
 ; GFX10_1-NEXT:    ;;#ASMEND
@@ -878,23 +879,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
 ; GFX10_1-NEXT:    ;;#ASMSTART
 ; GFX10_1-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s54, scc
 ; GFX10_1-NEXT:    ;;#ASMEND
-; GFX10_1-NEXT:    v_readlane_b32 s55, v21, 16
-; GFX10_1-NEXT:    v_readlane_b32 s54, v21, 15
-; GFX10_1-NEXT:    v_readlane_b32 s53, v21, 14
-; GFX10_1-NEXT:    v_readlane_b32 s52, v21, 13
-; GFX10_1-NEXT:    v_readlane_b32 s51, v21, 12
-; GFX10_1-NEXT:    v_readlane_b32 s50, v21, 11
-; GFX10_1-NEXT:    v_readlane_b32 s49, v21, 10
-; GFX10_1-NEXT:    v_readlane_b32 s48, v21, 9
-; GFX10_1-NEXT:    v_readlane_b32 s39, v21, 8
-; GFX10_1-NEXT:    v_readlane_b32 s38, v21, 7
-; GFX10_1-NEXT:    v_readlane_b32 s37, v21, 6
-; GFX10_1-NEXT:    v_readlane_b32 s36, v21, 5
-; GFX10_1-NEXT:    v_readlane_b32 s35, v21, 4
-; GFX10_1-NEXT:    v_readlane_b32 s34, v21, 3
-; GFX10_1-NEXT:    v_readlane_b32 s33, v21, 2
-; GFX10_1-NEXT:    v_readlane_b32 s31, v21, 1
-; GFX10_1-NEXT:    v_readlane_b32 s30, v21, 0
+; GFX10_1-NEXT:    v_readlane_b32 s30, v21, 15
+; GFX10_1-NEXT:    v_readlane_b32 s31, v21, 16
+; GFX10_1-NEXT:    v_readlane_b32 s55, v21, 14
+; GFX10_1-NEXT:    v_readlane_b32 s54, v21, 13
+; GFX10_1-NEXT:    v_readlane_b32 s53, v21, 12
+; GFX10_1-NEXT:    v_readlane_b32 s52, v21, 11
+; GFX10_1-NEXT:    v_readlane_b32 s51, v21, 10
+; GFX10_1-NEXT:    v_readlane_b32 s50, v21, 9
+; GFX10_1-NEXT:    v_readlane_b32 s49, v21, 8
+; GFX10_1-NEXT:    v_readlane_b32 s48, v21, 7
+; GFX10_1-NEXT:    v_readlane_b32 s39, v21, 6
+; GFX10_1-NEXT:    v_readlane_b32 s38, v21, 5
+; GFX10_1-NEXT:    v_readlane_b32 s37, v21, 4
+; GFX10_1-NEXT:    v_readlane_b32 s36, v21, 3
+; GFX10_1-NEXT:    v_readlane_b32 s35, v21, 2
+; GFX10_1-NEXT:    v_readlane_b32 s34, v21, 1
+; GFX10_1-NEXT:    v_readlane_b32 s33, v21, 0
 ; GFX10_1-NEXT:    s_xor_saveexec_b32 s4, -1
 ; GFX10_1-NEXT:    s_add_i32 s5, s32, 0x80200
 ; GFX10_1-NEXT:    buffer_load_dword v21, off, s[0:3], s5 ; 4-byte Folded Reload
@@ -910,24 +911,24 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
 ; GFX10_3-NEXT:    s_add_i32 s5, s32, 0x80200
 ; GFX10_3-NEXT:    buffer_store_dword v21, off, s[0:3], s5 ; 4-byte Folded Spill
 ; GFX10_3-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10_3-NEXT:    v_writelane_b32 v21, s30, 0
+; GFX10_3-NEXT:    v_writelane_b32 v21, s33, 0
 ; GFX10_3-NEXT:    s_and_b32 s59, 0, exec_lo
-; GFX10_3-NEXT:    v_writelane_b32 v21, s31, 1
-; GFX10_3-NEXT:    v_writelane_b32 v21, s33, 2
-; GFX10_3-NEXT:    v_writelane_b32 v21, s34, 3
-; GFX10_3-NEXT:    v_writelane_b32 v21, s35, 4
-; GFX10_3-NEXT:    v_writelane_b32 v21, s36, 5
-; GFX10_3-NEXT:    v_writelane_b32 v21, s37, 6
-; GFX10_3-NEXT:    v_writelane_b32 v21, s38, 7
-; GFX10_3-NEXT:    v_writelane_b32 v21, s39, 8
-; GFX10_3-NEXT:    v_writelane_b32 v21, s48, 9
-; GFX10_3-NEXT:    v_writelane_b32 v21, s49, 10
-; GFX10_3-NEXT:    v_writelane_b32 v21, s50, 11
-; GFX10_3-NEXT:    v_writelane_b32 v21, s51, 12
-; GFX10_3-NEXT:    v_writelane_b32 v21, s52, 13
-; GFX10_3-NEXT:    v_writelane_b32 v21, s53, 14
-; GFX10_3-NEXT:    v_writelane_b32 v21, s54, 15
-; GFX10_3-NEXT:    v_writelane_b32 v21, s55, 16
+; GFX10_3-NEXT:    v_writelane_b32 v21, s34, 1
+; GFX10_3-NEXT:    v_writelane_b32 v21, s35, 2
+; GFX10_3-NEXT:    v_writelane_b32 v21, s36, 3
+; GFX10_3-NEXT:    v_writelane_b32 v21, s37, 4
+; GFX10_3-NEXT:    v_writelane_b32 v21, s38, 5
+; GFX10_3-NEXT:    v_writelane_b32 v21, s39, 6
+; GFX10_3-NEXT:    v_writelane_b32 v21, s48, 7
+; GFX10_3-NEXT:    v_writelane_b32 v21, s49, 8
+; GFX10_3-NEXT:    v_writelane_b32 v21, s50, 9
+; GFX10_3-NEXT:    v_writelane_b32 v21, s51, 10
+; GFX10_3-NEXT:    v_writelane_b32 v21, s52, 11
+; GFX10_3-NEXT:    v_writelane_b32 v21, s53, 12
+; GFX10_3-NEXT:    v_writelane_b32 v21, s54, 13
+; GFX10_3-NEXT:    v_writelane_b32 v21, s55, 14
+; GFX10_3-NEXT:    v_writelane_b32 v21, s30, 15
+; GFX10_3-NEXT:    v_writelane_b32 v21, s31, 16
 ; GFX10_3-NEXT:    ;;#ASMSTART
 ; GFX10_3-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
 ; GFX10_3-NEXT:    ;;#ASMEND
@@ -938,23 +939,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
 ; GFX10_3-NEXT:    ;;#ASMSTART
 ; GFX10_3-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s54, scc
 ; GFX10_3-NEXT:    ;;#ASMEND
-; GFX10_3-NEXT:    v_readlane_b32 s55, v21, 16
-; GFX10_3-NEXT:    v_readlane_b32 s54, v21, 15
-; GFX10_3-NEXT:    v_readlane_b32 s53, v21, 14
-; GFX10_3-NEXT:    v_readlane_b32 s52, v21, 13
-; GFX10_3-NEXT:    v_readlane_b32 s51, v21, 12
-; GFX10_3-NEXT:    v_readlane_b32 s50, v21, 11
-; GFX10_3-NEXT:    v_readlane_b32 s49, v21, 10
-; GFX10_3-NEXT:    v_readlane_b32 s48, v21, 9
-; GFX10_3-NEXT:    v_readlane_b32 s39, v21, 8
-; GFX10_3-NEXT:    v_readlane_b32 s38, v21, 7
-; GFX10_3-NEXT:    v_readlane_b32 s37, v21, 6
-; GFX10_3-NEXT:    v_readlane_b32 s36, v21, 5
-; GFX10_3-NEXT:    v_readlane_b32 s35, v21, 4
-; GFX10_3-NEXT:    v_readlane_b32 s34, v21, 3
-; GFX10_3-NEXT:    v_readlane_b32 s33, v21, 2
-; GFX10_3-NEXT:    v_readlane_b32 s31, v21, 1
-; GFX10_3-NEXT:    v_readlane_b32 s30, v21, 0
+; GFX10_3-NEXT:    v_readlane_b32 s30, v21, 15
+; GFX10_3-NEXT:    v_readlane_b32 s31, v21, 16
+; GFX10_3-NEXT:    v_readlane_b32 s55, v21, 14
+; GFX10_3-NEXT:    v_readlane_b32 s54, v21, 13
+; GFX10_3-NEXT:    v_readlane_b32 s53, v21, 12
+; GFX10_3-NEXT:    v_readlane_b32 s52, v21, 11
+; GFX10_3-NEXT:    v_readlane_b32 s51, v21, 10
+; GFX10_3-NEXT:    v_readlane_b32 s50, v21, 9
+; GFX10_3-NEXT:    v_readlane_b32 s49, v21, 8
+; GFX10_3-NEXT:    v_readlane_b32 s48, v21, 7
+; GFX10_3-NEXT:    v_readlane_b32 s39, v21, 6
+; GFX10_3-NEXT:    v_readlane_b32 s38, v21, 5
+; GFX10_3-NEXT:    v_readlane_b32 s37, v21, 4
+; GFX10_3-NEXT:    v_readlane_b32 s36, v21, 3
+; GFX10_3-NEXT:    v_readlane_b32 s35, v21, 2
+; GFX10_3-NEXT:    v_readlane_b32 s34, v21, 1
+; GFX10_3-NEXT:    v_readlane_b32 s33, v21, 0
 ; GFX10_3-NEXT:    s_xor_saveexec_b32 s4, -1
 ; GFX10_3-NEXT:    s_add_i32 s5, s32, 0x80200
 ; GFX10_3-NEXT:    buffer_load_dword v21, off, s[0:3], s5 ; 4-byte Folded Reload
@@ -969,24 +970,24 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
 ; GFX11-NEXT:    s_add_i32 s1, s32, 0x4010
 ; GFX11-NEXT:    scratch_store_b32 off, v21, s1 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
-; GFX11-NEXT:    v_writelane_b32 v21, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v21, s33, 0
 ; GFX11-NEXT:    s_and_b32 s59, 0, exec_lo
-; GFX11-NEXT:    v_writelane_b32 v21, s31, 1
-; GFX11-NEXT:    v_writelane_b32 v21, s33, 2
-; GFX11-NEXT:    v_writelane_b32 v21, s34, 3
-; GFX11-NEXT:    v_writelane_b32 v21, s35, 4
-; GFX11-NEXT:    v_writelane_b32 v21, s36, 5
-; GFX11-NEXT:    v_writelane_b32 v21, s37, 6
-; GFX11-NEXT:    v_writelane_b32 v21, s38, 7
-; GFX11-NEXT:    v_writelane_b32 v21, s39, 8
-; GFX11-NEXT:    v_writelane_b32 v21, s48, 9
-; GFX11-NEXT:    v_writelane_b32 v21, s49, 10
-; GFX11-NEXT:    v_writelane_b32 v21, s50, 11
-; GFX11-NEXT:    v_writelane_b32 v21, s51, 12
-; GFX11-NEXT:    v_writelane_b32 v21, s52, 13
-; GFX11-NEXT:    v_writelane_b32 v21, s53, 14
-; GFX11-NEXT:    v_writelane_b32 v21, s54, 15
-; GFX11-NEXT:    v_writelane_b32 v21, s55, 16
+; GFX11-NEXT:    v_writelane_b32 v21, s34, 1
+; GFX11-NEXT:    v_writelane_b32 v21, s35, 2
+; GFX11-NEXT:    v_writelane_b32 v21, s36, 3
+; GFX11-NEXT:    v_writelane_b32 v21, s37, 4
+; GFX11-NEXT:    v_writelane_b32 v21, s38, 5
+; GFX11-NEXT:    v_writelane_b32 v21, s39, 6
+; GFX11-NEXT:    v_writelane_b32 v21, s48, 7
+; GFX11-NEXT:    v_writelane_b32 v21, s49, 8
+; GFX11-NEXT:    v_writelane_b32 v21, s50, 9
+; GFX11-NEXT:    v_writelane_b32 v21, s51, 10
+; GFX11-NEXT:    v_writelane_b32 v21, s52, 11
+; GFX11-NEXT:    v_writelane_b32 v21, s53, 12
+; GFX11-NEXT:    v_writelane_b32 v21, s54, 13
+; GFX11-NEXT:    v_writelane_b32 v21, s55, 14
+; GFX11-NEXT:    v_writelane_b32 v21, s30, 15
+; GFX11-NEXT:    v_writelane_b32 v21, s31, 16
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
 ; GFX11-NEXT:    ;;#ASMEND
@@ -999,23 +1000,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s54, scc
 ; GFX11-NEXT:    ;;#ASMEND
-; GFX11-NEXT:    v_readlane_b32 s55, v21, 16
-; GFX11-NEXT:    v_readlane_b32 s54, v21, 15
-; GFX11-NEXT:    v_readlane_b32 s53, v21, 14
-; GFX11-NEXT:    v_readlane_b32 s52, v21, 13
-; GFX11-NEXT:    v_readlane_b32 s51, v21, 12
-; GFX11-NEXT:    v_readlane_b32 s50, v21, 11
-; GFX11-NEXT:    v_readlane_b32 s49, v21, 10
-; GFX11-NEXT:    v_readlane_b32 s48, v21, 9
-; GFX11-NEXT:    v_readlane_b32 s39, v21, 8
-; GFX11-NEXT:    v_readlane_b32 s38, v21, 7
-; GFX11-NEXT:    v_readlane_b32 s37, v21, 6
-; GFX11-NEXT:    v_readlane_b32 s36, v21, 5
-; GFX11-NEXT:    v_readlane_b32 s35, v21, 4
-; GFX11-NEXT:    v_readlane_b32 s34, v21, 3
-; GFX11-NEXT:    v_readlane_b32 s33, v21, 2
-; GFX11-NEXT:    v_readlane_b32 s31, v21, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v21, 0
+; GFX11-NEXT:    v_readlane_b32 s30, v21, 15
+; GFX11-NEXT:    v_readlane_b32 s31, v21, 16
+; GFX11-NEXT:    v_readlane_b32 s55, v21, 14
+; GFX11-NEXT:    v_readlane_b32 s54, v21, 13
+; GFX11-NEXT:    v_readlane_b32 s53, v21, 12
+; GFX11-NEXT:    v_readlane_b32 s52, v21, 11
+; GFX11-NEXT:    v_readlane_b32 s51, v21, 10
+; GFX11-NEXT:    v_readlane_b32 s50, v21, 9
+; GFX11-NEXT:    v_readlane_b32 s49, v21, 8
+; GFX11-NEXT:    v_readlane_b32 s48, v21, 7
+; GFX11-NEXT:    v_readlane_b32 s39, v21, 6
+; GFX11-NEXT:    v_readlane_b32 s38, v21, 5
+; GFX11-NEXT:    v_readlane_b32 s37, v21, 4
+; GFX11-NEXT:    v_readlane_b32 s36, v21, 3
+; GFX11-NEXT:    v_readlane_b32 s35, v21, 2
+; GFX11-NEXT:    v_readlane_b32 s34, v21, 1
+; GFX11-NEXT:    v_readlane_b32 s33, v21, 0
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX11-NEXT:    s_add_i32 s1, s32, 0x4010
 ; GFX11-NEXT:    scratch_load_b32 v21, off, s1 ; 4-byte Folded Reload
@@ -1034,24 +1035,24 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
 ; GFX12-NEXT:    scratch_store_b32 off, v21, s32 offset:16384 ; 4-byte Folded Spill
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_mov_b32 exec_lo, s0
-; GFX12-NEXT:    v_writelane_b32 v21, s30, 0
+; GFX12-NEXT:    v_writelane_b32 v21, s33, 0
 ; GFX12-NEXT:    s_and_b32 s59, 0, exec_lo
-; GFX12-NEXT:    v_writelane_b32 v21, s31, 1
-; GFX12-NEXT:    v_writelane_b32 v21, s33, 2
-; GFX12-NEXT:    v_writelane_b32 v21, s34, 3
-; GFX12-NEXT:    v_writelane_b32 v21, s35, 4
-; GFX12-NEXT:    v_writelane_b32 v21, s36, 5
-; GFX12-NEXT:    v_writelane_b32 v21, s37, 6
-; GFX12-NEXT:    v_writelane_b32 v21, s38, 7
-; GFX12-NEXT:    v_writelane_b32 v21, s39, 8
-; GFX12-NEXT:    v_writelane_b32 v21, s48, 9
-; GFX12-NEXT:    v_writelane_b32 v21, s49, 10
-; GFX12-NEXT:    v_writelane_b32 v21, s50, 11
-; GFX12-NEXT:    v_writelane_b32 v21, s51, 12
-; GFX12-NEXT:    v_writelane_b32 v21, s52, 13
-; GFX12-NEXT:    v_writelane_b32 v21, s53, 14
-; GFX12-NEXT:    v_writelane_b32 v21, s54, 15
-; GFX12-NEXT:    v_writelane_b32 v21, s55, 16
+; GFX12-NEXT:    v_writelane_b32 v21, s34, 1
+; GFX12-NEXT:    v_writelane_b32 v21, s35, 2
+; GFX12-NEXT:    v_writelane_b32 v21, s36, 3
+; GFX12-NEXT:    v_writelane_b32 v21, s37, 4
+; GFX12-NEXT:    v_writelane_b32 v21, s38, 5
+; GFX12-NEXT:    v_writelane_b32 v21, s39, 6
+; GFX12-NEXT:    v_writelane_b32 v21, s48, 7
+; GFX12-NEXT:    v_writelane_b32 v21, s49, 8
+; GFX12-NEXT:    v_writelane_b32 v21, s50, 9
+; GFX12-NEXT:    v_writelane_b32 v21, s51, 10
+; GFX12-NEXT:    v_writelane_b32 v21, s52, 11
+; GFX12-NEXT:    v_writelane_b32 v21, s53, 12
+; GFX12-NEXT:    v_writelane_b32 v21, s54, 13
+; GFX12-NEXT:    v_writelane_b32 v21, s55, 14
+; GFX12-NEXT:    v_writelane_b32 v21, s30, 15
+; GFX12-NEXT:    v_writelane_b32 v21, s31, 16
 ; GFX12-NEXT:    ;;#ASMSTART
 ; GFX12-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
 ; GFX12-NEXT:    ;;#ASMEND
@@ -1061,23 +1062,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
 ; GFX12-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s54, scc
 ; GFX12-NEXT:    ;;#ASMEND
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_readlane_b32 s55, v21, 16
-; GFX12-NEXT:    v_readlane_b32 s54, v21, 15
-; GFX12-NEXT:    v_readlane_b32 s53, v21, 14
-; GFX12-NEXT:    v_readlane_b32 s52, v21, 13
-; GFX12-NEXT:    v_readlane_b32 s51, v21, 12
-; GFX12-NEXT:    v_readlane_b32 s50, v21, 11
-; GFX12-NEXT:    v_readlane_b32 s49, v21, 10
-; GFX12-NEXT:    v_readlane_b32 s48, v21, 9
-; GFX12-NEXT:    v_readlane_b32 s39, v21, 8
-; GFX12-NEXT:    v_readlane_b32 s38, v21, 7
-; GFX12-NEXT:    v_readlane_b32 s37, v21, 6
-; GFX12-NEXT:    v_readlane_b32 s36, v21, 5
-; GFX12-NEXT:    v_readlane_b32 s35, v21, 4
-; GFX12-NEXT:    v_readlane_b32 s34, v21, 3
-; GFX12-NEXT:    v_readlane_b32 s33, v21, 2
-; GFX12-NEXT:    v_readlane_b32 s31, v21, 1
-; GFX12-NEXT:    v_readlane_b32 s30, v21, 0
+; GFX12-NEXT:    v_readlane_b32 s30, v21, 15
+; GFX12-NEXT:    v_readlane_b32 s31, v21, 16
+; GFX12-NEXT:    v_readlane_b32 s55, v21, 14
+; GFX12-NEXT:    v_readlane_b32 s54, v21, 13
+; GFX12-NEXT:    v_readlane_b32 s53, v21, 12
+; GFX12-NEXT:    v_readlane_b32 s52, v21, 11
+; GFX12-NEXT:    v_readlane_b32 s51, v21, 10
+; GFX12-NEXT:    v_readlane_b32 s50, v21, 9
+; GFX12-NEXT:    v_readlane_b32 s49, v21, 8
+; GFX12-NEXT:    v_readlane_b32 s48, v21, 7
+; GFX12-NEXT:    v_readlane_b32 s39, v21, 6
+; GFX12-NEXT:    v_readlane_b32 s38, v21, 5
+; GFX12-NEXT:    v_readlane_b32 s37, v21, 4
+; GFX12-NEXT:    v_readlane_b32 s36, v21, 3
+; GFX12-NEXT:    v_readlane_b32 s35, v21, 2
+; GFX12-NEXT:    v_readlane_b32 s34, v21, 1
+; GFX12-NEXT:    v_readlane_b32 s33, v21, 0
 ; GFX12-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX12-NEXT:    scratch_load_b32 v21, off, s32 offset:16384 ; 4-byte Folded Reload
 ; GFX12-NEXT:    s_wait_alu 0xfffe
@@ -1135,30 +1136,30 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
 ; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX7-NEXT:    v_writelane_b32 v23, s28, 17
 ; GFX7-NEXT:    v_writelane_b32 v23, s29, 18
-; GFX7-NEXT:    v_writelane_b32 v23, s30, 0
-; GFX7-NEXT:    v_writelane_b32 v23, s31, 1
-; GFX7-NEXT:    v_writelane_b32 v23, s33, 2
-; GFX7-NEXT:    v_writelane_b32 v23, s34, 3
-; GFX7-NEXT:    v_writelane_b32 v23, s35, 4
-; GFX7-NEXT:    v_writelane_b32 v23, s36, 5
-; GFX7-NEXT:    v_writelane_b32 v23, s37, 6
-; GFX7-NEXT:    v_writelane_b32 v23, s38, 7
-; GFX7-NEXT:    v_writelane_b32 v23, s39, 8
-; GFX7-NEXT:    v_writelane_b32 v23, s48, 9
-; GFX7-NEXT:    v_writelane_b32 v23, s49, 10
-; GFX7-NEXT:    v_writelane_b32 v23, s50, 11
-; GFX7-NEXT:    v_writelane_b32 v23, s51, 12
-; GFX7-NEXT:    v_writelane_b32 v23, s52, 13
+; GFX7-NEXT:    v_writelane_b32 v23, s33, 0
+; GFX7-NEXT:    v_writelane_b32 v23, s34, 1
+; GFX7-NEXT:    v_writelane_b32 v23, s35, 2
+; GFX7-NEXT:    v_writelane_b32 v23, s36, 3
+; GFX7-NEXT:    v_writelane_b32 v23, s37, 4
+; GFX7-NEXT:    v_writelane_b32 v23, s38, 5
+; GFX7-NEXT:    v_writelane_b32 v23, s39, 6
+; GFX7-NEXT:    v_writelane_b32 v23, s48, 7
+; GFX7-NEXT:    v_writelane_b32 v23, s49, 8
+; GFX7-NEXT:    v_writelane_b32 v23, s50, 9
+; GFX7-NEXT:    v_writelane_b32 v23, s51, 10
+; GFX7-NEXT:    v_writelane_b32 v23, s52, 11
+; GFX7-NEXT:    v_writelane_b32 v23, s53, 12
+; GFX7-NEXT:    v_writelane_b32 v23, s54, 13
 ; GFX7-NEXT:    s_lshr_b32 s5, s32, 6
-; GFX7-NEXT:    v_writelane_b32 v23, s53, 14
+; GFX7-NEXT:    v_writelane_b32 v23, s55, 14
 ; GFX7-NEXT:    v_lshr_b32_e64 v0, s32, 6
 ; GFX7-NEXT:    s_add_i32 s4, s5, 0x4240
 ; GFX7-NEXT:    ; implicit-def: $vgpr22 : SGPR spill to VGPR lane
-; GFX7-NEXT:    v_writelane_b32 v23, s54, 15
+; GFX7-NEXT:    v_writelane_b32 v23, s30, 15
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 64, v0
 ; GFX7-NEXT:    v_writelane_b32 v22, s4, 0
 ; GFX7-NEXT:    s_and_b64 s[4:5], 0, exec
-; GFX7-NEXT:    v_writelane_b32 v23, s55, 16
+; GFX7-NEXT:    v_writelane_b32 v23, s31, 16
 ; GFX7-NEXT:    ;;#ASMSTART
 ; GFX7-NEXT:    ; use alloca0 v0
 ; GFX7-NEXT:    ;;#ASMEND
@@ -1169,23 +1170,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
 ; GFX7-NEXT:    ;;#ASMSTART
 ; GFX7-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s54, scc
 ; GFX7-NEXT:    ;;#ASMEND
-; GFX7-NEXT:    v_readlane_b32 s55, v23, 16
-; GFX7-NEXT:    v_readlane_b32 s54, v23, 15
-; GFX7-NEXT:    v_readlane_b32 s53, v23, 14
-; GFX7-NEXT:    v_readlane_b32 s52, v23, 13
-; GFX7-NEXT:    v_readlane_b32 s51, v23, 12
-; GFX7-NEXT:    v_readlane_b32 s50, v23, 11
-; GFX7-NEXT:    v_readlane_b32 s49, v23, 10
-; GFX7-NEXT:    v_readlane_b32 s48, v23, 9
-; GFX7-NEXT:    v_readlane_b32 s39, v23, 8
-; GFX7-NEXT:    v_readlane_b32 s38, v23, 7
-; GFX7-NEXT:    v_readlane_b32 s37, v23, 6
-; GFX7-NEXT:    v_readlane_b32 s36, v23, 5
-; GFX7-NEXT:    v_readlane_b32 s35, v23, 4
-; GFX7-NEXT:    v_readlane_b32 s34, v23, 3
-; GFX7-NEXT:    v_readlane_b32 s33, v23, 2
-; GFX7-NEXT:    v_readlane_b32 s31, v23, 1
-; GFX7-NEXT:    v_readlane_b32 s30, v23, 0
+; GFX7-NEXT:    v_readlane_b32 s30, v23, 15
+; GFX7-NEXT:    v_readlane_b32 s31, v23, 16
+; GFX7-NEXT:    v_readlane_b32 s55, v23, 14
+; GFX7-NEXT:    v_readlane_b32 s54, v23, 13
+; GFX7-NEXT:    v_readlane_b32 s53, v23, 12
+; GFX7-NEXT:    v_readlane_b32 s52, v23, 11
+; GFX7-NEXT:    v_readlane_b32 s51, v23, 10
+; GFX7-NEXT:    v_readlane_b32 s50, v23, 9
+; GFX7-NEXT:    v_readlane_b32 s49, v23, 8
+; GFX7-NEXT:    v_readlane_b32 s48, v23, 7
+; GFX7-NEXT:    v_readlane_b32 s39, v23, 6
+; GFX7-NEXT:    v_readlane_b32 s38, v23, 5
+; GFX7-NEXT:    v_readlane_b32 s37, v23, 4
+; GFX7-NEXT:    v_readlane_b32 s36, v23, 3
+; GFX7-NEXT:    v_readlane_b32 s35, v23, 2
+; GFX7-NEXT:    v_readlane_b32 s34, v23, 1
+; GFX7-NEXT:    v_readlane_b32 s33, v23, 0
 ; GFX7-NEXT:    v_readlane_b32 s28, v23, 17
 ; GFX7-NEXT:    v_readlane_b32 s29, v23, 18
 ; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
@@ -1206,30 +1207,30 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
 ; GFX8-NEXT:    s_add_i32 s6, s32, 0x201100
 ; GFX8-NEXT:    buffer_store_dword v22, off, s[0:3], s6 ; 4-byte Folded Spill
 ; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX8-NEXT:    v_writelane_b32 v23, s30, 0
-; GFX8-NEXT:    v_writelane_b32 v23, s31, 1
-; GFX8-NEXT:    v_writelane_b32 v23, s33, 2
-; GFX8-NEXT:    v_writelane_b32 v23, s34, 3
-; GFX8-NEXT:    v_writelane_b32 v23, s35, 4
-; GFX8-NEXT:    v_writelane_b32 v23, s36, 5
-; GFX8-NEXT:    v_writelane_b32 v23, s37, 6
-; GFX8-NEXT:    v_writelane_b32 v23, s38, 7
-; GFX8-NEXT:    v_writelane_b32 v23, s39, 8
-; GFX8-NEXT:    v_writelane_b32 v23, s48, 9
-; GFX8-NEXT:    v_writelane_b32 v23, s49, 10
-; GFX8-NEXT:    v_writelane_b32 v23, s50, 11
-; GFX8-NEXT:    v_writelane_b32 v23, s51, 12
-; GFX8-NEXT:    v_writelane_b32 v23, s52, 13
+; GFX8-NEXT:    v_writelane_b32 v23, s33, 0
+; GFX8-NEXT:    v_writelane_b32 v23, s34, 1
+; GFX8-NEXT:    v_writelane_b32 v23, s35, 2
+; GFX8-NEXT:    v_writelane_b32 v23, s36, 3
+; GFX8-NEXT:    v_writelane_b32 v23, s37, 4
+; GFX8-NEXT:    v_writelane_b32 v23, s38, 5
+; GFX8-NEXT:    v_writelane_b32 v23, s39, 6
+; GFX8-NEXT:    v_writelane_b32 v23, s48, 7
+; GFX8-NEXT:    v_writelane_b32 v23, s49, 8
+; GFX8-NEXT:    v_writelane_b32 v23, s50, 9
+; GFX8-NEXT:    v_writelane_b32 v23, s51, 10
+; GFX8-NEXT:    v_writelane_b32 v23, s52, 11
+; GFX8-NEXT:    v_writelane_b32 v23, s53, 12
+; GFX8-NEXT:    v_writelane_b32 v23, s54, 13
 ; GFX8-NEXT:    s_lshr_b32 s5, s32, 6
-; GFX8-NEXT:    v_writelane_b32 v23, s53, 14
+; GFX8-NEXT:    v_writelane_b32 v23, s55, 14
 ; GFX8-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
 ; GFX8-NEXT:    s_add_i32 s4, s5, 0x4240
 ; GFX8-NEXT:    ; implicit-def: $vgpr22 : SGPR spill to VGPR lane
-; GFX8-NEXT:    v_writelane_b32 v23, s54, 15
+; GFX8-NEXT:    v_writelane_b32 v23, s30, 15
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 64, v0
 ; GFX8-NEXT:    v_writelane_b32 v22, s4, 0
 ; GFX8-NEXT:    s_and_b64 s[4:5], 0, exec
-; GFX8-NEXT:    v_writelane_b32 v23, s55, 16
+; GFX8-NEXT:    v_writelane_b32 v23, s31, 16
 ; GFX8-NEXT:    ;;#ASMSTART
 ; GFX8-NEXT:    ; use alloca0 v0
 ; GFX8-NEXT:    ;;#ASMEND
@@ -1241,23 +1242,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
 ; GFX8-NEXT:    ;;#ASMSTART
 ; GFX8-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s54, scc
 ; GFX8-NEXT:    ;;#ASMEND
-; GFX8-NEXT:    v_readlane_b32 s55, v23, 16
-; GFX8-NEXT:    v_readlane_b32 s54, v23, 15
-; GFX8-NEXT:    v_readlane_b32 s53, v23, 14
-; GFX8-NEXT:    v_readlane_b32 s52, v23, 13
-; GFX8-NEXT:    v_readlane_b32 s51, v23, 12
-; GFX8-NEXT:    v_readlane_b32 s50, v23, 11
-; GFX8-NEXT:    v_readlane_b32 s49, v23, 10
-; GFX8-NEXT:    v_readlane_b32 s48, v23, 9
-; GFX8-NEXT:    v_readlane_b32 s39, v23, 8
-; GFX8-NEXT:    v_readlane_b32 s38, v23, 7
-; GFX8-NEXT:    v_readlane_b32 s37, v23, 6
-; GFX8-NEXT:    v_readlane_b32 s36, v23, 5
-; GFX8-NEXT:    v_readlane_b32 s35, v23, 4
-; GFX8-NEXT:    v_readlane_b32 s34, v23, 3
-; GFX8-NEXT:    v_readlane_b32 s33, v23, 2
-; GFX8-NEXT:    v_readlane_b32 s31, v23, 1
-; GFX8-NEXT:    v_readlane_b32 s30, v23, 0
+; GFX8-NEXT:    v_readlane_b32 s30, v23, 15
+; GFX8-NEXT:    v_readlane_b32 s31, v23, 16
+; GFX8-NEXT:    v_readlane_b32 s55, v23, 14
+; GFX8-NEXT:    v_readlane_b32 s54, v23, 13
+; GFX8-NEXT:    v_readlane_b32 s53, v23, 12
+; GFX8-NEXT:    v_readlane_b32 s52, v23, 11
+; GFX8-NEXT:    v_readlane_b32 s51, v23, 10
+; GFX8-NEXT:    v_readlane_b32 s50, v23, 9
+; GFX8-NEXT:    v_readlane_b32 s49, v23, 8
+; GFX8-NEXT:    v_readlane_b32 s48, v23, 7
+; GFX8-NEXT:    v_readlane_b32 s39, v23, 6
+; GFX8-NEXT:    v_readlane_b32 s38, v23, 5
+; GFX8-NEXT:    v_readlane_b32 s37, v23, 4
+; GFX8-NEXT:    v_readlane_b32 s36, v23, 3
+; GFX8-NEXT:    v_readlane_b32 s35, v23, 2
+; GFX8-NEXT:    v_readlane_b32 s34, v23, 1
+; GFX8-NEXT:    v_readlane_b32 s33, v23, 0
 ; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX8-NEXT:    s_add_i32 s6, s32, 0x201000
 ; GFX8-NEXT:    buffer_load_dword v23, off, s[0:3], s6 ; 4-byte Folded Reload
@@ -1276,30 +1277,30 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
 ; GFX900-NEXT:    s_add_i32 s6, s32, 0x201100
 ; GFX900-NEXT:    buffer_store_dword v22, off, s[0:3], s6 ; 4-byte Folded Spill
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX900-NEXT:    v_writelane_b32 v23, s30, 0
-; GFX900-NEXT:    v_writelane_b32 v23, s31, 1
-; GFX900-NEXT:    v_writelane_b32 v23, s33, 2
-; GFX900-NEXT:    v_writelane_b32 v23, s34, 3
-; GFX900-NEXT:    v_writelane_b32 v23, s35, 4
-; GFX900-NEXT:    v_writelane_b32 v23, s36, 5
-; GFX900-NEXT:    v_writelane_b32 v23, s37, 6
-; GFX900-NEXT:    v_writelane_b32 v23, s38, 7
-; GFX900-NEXT:    v_writelane_b32 v23, s39, 8
-; GFX900-NEXT:    v_writelane_b32 v23, s48, 9
-; GFX900-NEXT:    v_writelane_b32 v23, s49, 10
-; GFX900-NEXT:    v_writelane_b32 v23, s50, 11
-; GFX900-NEXT:    v_writelane_b32 v23, s51, 12
-; GFX900-NEXT:    v_writelane_b32 v23, s52, 13
+; GFX900-NEXT:    v_writelane_b32 v23, s33, 0
+; GFX900-NEXT:    v_writelane_b32 v23, s34, 1
+; GFX900-NEXT:    v_writelane_b32 v23, s35, 2
+; GFX900-NEXT:    v_writelane_b32 v23, s36, 3
+; GFX900-NEXT:    v_writelane_b32 v23, s37, 4
+; GFX900-NEXT:    v_writelane_b32 v23, s38, 5
+; GFX900-NEXT:    v_writelane_b32 v23, s39, 6
+; GFX900-NEXT:    v_writelane_b32 v23, s48, 7
+; GFX900-NEXT:    v_writelane_b32 v23, s49, 8
+; GFX900-NEXT:    v_writelane_b32 v23, s50, 9
+; GFX900-NEXT:    v_writelane_b32 v23, s51, 10
+; GFX900-NEXT:    v_writelane_b32 v23, s52, 11
+; GFX900-NEXT:    v_writelane_b32 v23, s53, 12
+; GFX900-NEXT:    v_writelane_b32 v23, s54, 13
 ; GFX900-NEXT:    s_lshr_b32 s5, s32, 6
-; GFX900-NEXT:    v_writelane_b32 v23, s53, 14
+; GFX900-NEXT:    v_writelane_b32 v23, s55, 14
 ; GFX900-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
 ; GFX900-NEXT:    s_add_i32 s4, s5, 0x4240
 ; GFX900-NEXT:    ; implicit-def: $vgpr22 : SGPR spill to VGPR lane
-; GFX900-NEXT:    v_writelane_b32 v23, s54, 15
+; GFX900-NEXT:    v_writelane_b32 v23, s30, 15
 ; GFX900-NEXT:    v_add_u32_e32 v0, 64, v0
 ; GFX900-NEXT:    v_writelane_b32 v22, s4, 0
 ; GFX900-NEXT:    s_and_b64 s[4:5], 0, exec
-; GFX900-NEXT:    v_writelane_b32 v23, s55, 16
+; GFX900-NEXT:    v_writelane_b32 v23, s31, 16
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use alloca0 v0
 ; GFX900-NEXT:    ;;#ASMEND
@@ -1311,23 +1312,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s54, scc
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    v_readlane_b32 s55, v23, 16
-; GFX900-NEXT:    v_readlane_b32 s54, v23, 15
-; GFX900-NEXT:    v_readlane_b32 s53, v23, 14
-; GFX900-NEXT:    v_readlane_b32 s52, v23, 13
-; GFX900-NEXT:    v_readlane_b32 s51, v23, 12
-; GFX900-NEXT:    v_readlane_b32 s50, v23, 11
-; GFX900-NEXT:    v_readlane_b32 s49, v23, 10
-; GFX900-NEXT:    v_readlane_b32 s48, v23, 9
-; GFX900-NEXT:    v_readlane_b32 s39, v23, 8
-; GFX900-NEXT:    v_readlane_b32 s38, v23, 7
-; GFX900-NEXT:    v_readlane_b32 s37, v23, 6
-; GFX900-NEXT:    v_readlane_b32 s36, v23, 5
-; GFX900-NEXT:    v_readlane_b32 s35, v23, 4
-; GFX900-NEXT:    v_readlane_b32 s34, v23, 3
-; GFX900-NEXT:    v_readlane_b32 s33, v23, 2
-; GFX900-NEXT:    v_readlane_b32 s31, v23, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v23, 0
+; GFX900-NEXT:    v_readlane_b32 s30, v23, 15
+; GFX900-NEXT:    v_readlane_b32 s31, v23, 16
+; GFX900-NEXT:    v_readlane_b32 s55, v23, 14
+; GFX900-NEXT:    v_readlane_b32 s54, v23, 13
+; GFX900-NEXT:    v_readlane_b32 s53, v23, 12
+; GFX900-NEXT:    v_readlane_b32 s52, v23, 11
+; GFX900-NEXT:    v_readlane_b32 s51, v23, 10
+; GFX900-NEXT:    v_readlane_b32 s50, v23, 9
+; GFX900-NEXT:    v_readlane_b32 s49, v23, 8
+; GFX900-NEXT:    v_readlane_b32 s48, v23, 7
+; GFX900-NEXT:    v_readlane_b32 s39, v23, 6
+; GFX900-NEXT:    v_readlane_b32 s38, v23, 5
+; GFX900-NEXT:    v_readlane_b32 s37, v23, 4
+; GFX900-NEXT:    v_readlane_b32 s36, v23, 3
+; GFX900-NEXT:    v_readlane_b32 s35, v23, 2
+; GFX900-NEXT:    v_readlane_b32 s34, v23, 1
+; GFX900-NEXT:    v_readlane_b32 s33, v23, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    s_add_i32 s6, s32, 0x201000
 ; GFX900-NEXT:    buffer_load_dword v23, off, s[0:3], s6 ; 4-byte Folded Reload
@@ -1344,28 +1345,29 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
 ; GFX942-NEXT:    s_add_i32 s2, s32, 0x8040
 ; GFX942-NEXT:    scratch_store_dword off, v22, s2 ; 4-byte Folded Spill
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX942-NEXT:    v_writelane_b32 v22, s30, 0
-; GFX942-NEXT:    v_writelane_b32 v22, s31, 1
-; GFX942-NEXT:    v_writelane_b32 v22, s33, 2
-; GFX942-NEXT:    v_writelane_b32 v22, s34, 3
-; GFX942-NEXT:    v_writelane_b32 v22, s35, 4
-; GFX942-NEXT:    v_writelane_b32 v22, s36, 5
-; GFX942-NEXT:    v_writelane_b32 v22, s37, 6
-; GFX942-NEXT:    v_writelane_b32 v22, s38, 7
-; GFX942-NEXT:    v_writelane_b32 v22, s39, 8
-; GFX942-NEXT:    v_writelane_b32 v22, s48, 9
-; GFX942-NEXT:    v_writelane_b32 v22, s49, 10
-; GFX942-NEXT:    v_writelane_b32 v22, s50, 11
-; GFX942-NEXT:    v_writelane_b32 v22, s51, 12
-; GFX942-NEXT:    v_writelane_b32 v22, s52, 13
-; GFX942-NEXT:    v_writelane_b32 v22, s53, 14
+; GFX942-NEXT:    v_writelane_b32 v22, s33, 0
+; GFX942-NEXT:    v_writelane_b32 v22, s34, 1
+; GFX942-NEXT:    v_writelane_b32 v22, s35, 2
+; GFX942-NEXT:    v_writelane_b32 v22, s36, 3
+; GFX942-NEXT:    v_writelane_b32 v22, s37, 4
+; GFX942-NEXT:    v_writelane_b32 v22, s38, 5
+; GFX942-NEXT:    v_writelane_b32 v22, s39, 6
+; GFX942-NEXT:    v_writelane_b32 v22, s48, 7
+; GFX942-NEXT:    v_writelane_b32 v22, s49, 8
+; GFX942-NEXT:    v_writelane_b32 v22, s50, 9
+; GFX942-NEXT:    v_writelane_b32 v22, s51, 10
+; GFX942-NEXT:    v_writelane_b32 v22, s52, 11
+; GFX942-NEXT:    v_writelane_b32 v22, s53, 12
+; GFX942-NEXT:    v_writelane_b32 v22, s54, 13
+; GFX942-NEXT:    v_writelane_b32 v22, s55, 14
 ; GFX942-NEXT:    s_add_i32 s0, s32, 64
-; GFX942-NEXT:    v_writelane_b32 v22, s54, 15
+; GFX942-NEXT:    v_writelane_b32 v22, s30, 15
 ; GFX942-NEXT:    v_mov_b32_e32 v0, s0
-; GFX942-NEXT:    v_writelane_b32 v22, s55, 16
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use alloca0 v0
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_writelane_b32 v22, s31, 16
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc
 ; GFX942-NEXT:    ;;#ASMEND
@@ -1376,23 +1378,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s54, scc
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_readlane_b32 s55, v22, 16
-; GFX942-NEXT:    v_readlane_b32 s54, v22, 15
-; GFX942-NEXT:    v_readlane_b32 s53, v22, 14
-; GFX942-NEXT:    v_readlane_b32 s52, v22, 13
-; GFX942-NEXT:    v_readlane_b32 s51, v22, 12
-; GFX942-NEXT:    v_readlane_b32 s50, v22, 11
-; GFX942-NEXT:    v_readlane_b32 s49, v22, 10
-; GFX942-NEXT:    v_readlane_b32 s48, v22, 9
-; GFX942-NEXT:    v_readlane_b32 s39, v22, 8
-; GFX942-NEXT:    v_readlane_b32 s38, v22, 7
-; GFX942-NEXT:    v_readlane_b32 s37, v22, 6
-; GFX942-NEXT:    v_readlane_b32 s36, v22, 5
-; GFX942-NEXT:    v_readlane_b32 s35, v22, 4
-; GFX942-NEXT:    v_readlane_b32 s34, v22, 3
-; GFX942-NEXT:    v_readlane_b32 s33, v22, 2
-; GFX942-NEXT:    v_readlane_b32 s31, v22, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v22, 0
+; GFX942-NEXT:    v_readlane_b32 s30, v22, 15
+; GFX942-NEXT:    v_readlane_b32 s31, v22, 16
+; GFX942-NEXT:    v_readlane_b32 s55, v22, 14
+; GFX942-NEXT:    v_readlane_b32 s54, v22, 13
+; GFX942-NEXT:    v_readlane_b32 s53, v22, 12
+; GFX942-NEXT:    v_readlane_b32 s52, v22, 11
+; GFX942-NEXT:    v_readlane_b32 s51, v22, 10
+; GFX942-NEXT:    v_readlane_b32 s50, v22, 9
+; GFX942-NEXT:    v_readlane_b32 s49, v22, 8
+; GFX942-NEXT:    v_readlane_b32 s48, v22, 7
+; GFX942-NEXT:    v_readlane_b32 s39, v22, 6
+; GFX942-NEXT:    v_readlane_b32 s38, v22, 5
+; GFX942-NEXT:    v_readlane_b32 s37, v22, 4
+; GFX942-NEXT:    v_readlane_b32 s36, v22, 3
+; GFX942-NEXT:    v_readlane_b32 s35, v22, 2
+; GFX942-NEXT:    v_readlane_b32 s34, v22, 1
+; GFX942-NEXT:    v_readlane_b32 s33, v22, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    s_add_i32 s2, s32, 0x8040
 ; GFX942-NEXT:    scratch_load_dword v22, off, s2 ; 4-byte Folded Reload
@@ -1408,31 +1410,31 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
 ; GFX10_1-NEXT:    buffer_store_dword v22, off, s[0:3], s5 ; 4-byte Folded Spill
 ; GFX10_1-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10_1-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10_1-NEXT:    v_writelane_b32 v22, s30, 0
+; GFX10_1-NEXT:    v_writelane_b32 v22, s33, 0
 ; GFX10_1-NEXT:    v_lshrrev_b32_e64 v0, 5, s32
 ; GFX10_1-NEXT:    s_lshr_b32 s4, s32, 5
 ; GFX10_1-NEXT:    s_add_i32 s58, s4, 0x4240
-; GFX10_1-NEXT:    v_writelane_b32 v22, s31, 1
+; GFX10_1-NEXT:    v_writelane_b32 v22, s34, 1
 ; GFX10_1-NEXT:    v_add_nc_u32_e32 v0, 64, v0
 ; GFX10_1-NEXT:    s_and_b32 s4, 0, exec_lo
 ; GFX10_1-NEXT:    ;;#ASMSTART
 ; GFX10_1-NEXT:    ; use alloca0 v0
 ; GFX10_1-NEXT:    ;;#ASMEND
-; GFX10_1-NEXT:    v_writelane_b32 v22, s33, 2
-; GFX10_1-NEXT:    v_writelane_b32 v22, s34, 3
-; GFX10_1-NEXT:    v_writelane_b32 v22, s35, 4
-; GFX10_1-NEXT:    v_writelane_b32 v22, s36, 5
-; GFX10_1-NEXT:    v_writelane_b32 v22, s37, 6
-; GFX10_1-NEXT:    v_writelane_b32 v22, s38, 7
-; GFX10_1-NEXT:    v_writelane_b32 v22, s39, 8
-; GFX10_1-NEXT:    v_writelane_b32 v22, s48, 9
-; GFX10_1-NEXT:    v_writelane_b32 v22, s49, 10
-; GFX10_1-NEXT:    v_writelane_b32 v22, s50, 11
-; GFX10_1-NEXT:    v_writelane_b32 v22, s51, 12
-; GFX10_1-NEXT:    v_writelane_b32 v22, s52, 13
-; GFX10_1-NEXT:    v_writelane_b32 v22, s53, 14
-; GFX10_1-NEXT:    v_writelane_b32 v22, s54, 15
-; GFX10_1-NEXT:    v_writelane_b32 v22, s55, 16
+; GFX10_1-NEXT:    v_writelane_b32 v22, s35, 2
+; GFX10_1-NEXT:    v_writelane_b32 v22, s36, 3
+; GFX10_1-NEXT:    v_writelane_b32 v22, s37, 4
+; GFX10_1-NEXT:    v_writelane_b32 v22, s38, 5
+; GFX10_1-NEXT:    v_writelane_b32 v22, s39, 6
+; GFX10_1-NEXT:    v_writelane_b32 v22, s48, 7
+; GFX10_1-NEXT:    v_writelane_b32 v22, s49, 8
+; GFX10_1-NEXT:    v_writelane_b32 v22, s50, 9
+; GFX10_1-NEXT:    v_writelane_b32 v22, s51, 10
+; GFX10_1-NEXT:    v_writelane_b32 v22, s52, 11
+; GFX10_1-NEXT:    v_writelane_b32 v22, s53, 12
+; GFX10_1-NEXT:    v_writelane_b32 v22, s54, 13
+; GFX10_1-NEXT:    v_writelane_b32 v22, s55, 14
+; GFX10_1-NEXT:    v_writelane_b32 v22, s30, 15
+; GFX10_1-NEXT:    v_writelane_b32 v22, s31, 16
 ; GFX10_1-NEXT:    ;;#ASMSTART
 ; GFX10_1-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc
 ; GFX10_1-NEXT:    ;;#ASMEND
@@ -1441,23 +1443,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
 ; GFX10_1-NEXT:    ;;#ASMSTART
 ; GFX10_1-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s54, scc
 ; GFX10_1-NEXT:    ;;#ASMEND
-; GFX10_1-NEXT:    v_readlane_b32 s55, v22, 16
-; GFX10_1-NEXT:    v_readlane_b32 s54, v22, 15
-; GFX10_1-NEXT:    v_readlane_b32 s53, v22, 14
-; GFX10_1-NEXT:    v_readlane_b32 s52, v22, 13
-; GFX10_1-NEXT:    v_readlane_b32 s51, v22, 12
-; GFX10_1-NEXT:    v_readlane_b32 s50, v22, 11
-; GFX10_1-NEXT:    v_readlane_b32 s49, v22, 10
-; GFX10_1-NEXT:    v_readlane_b32 s48, v22, 9
-; GFX10_1-NEXT:    v_readlane_b32 s39, v22, 8
-; GFX10_1-NEXT:    v_readlane_b32 s38, v22, 7
-; GFX10_1-NEXT:    v_readlane_b32 s37, v22, 6
-; GFX10_1-NEXT:    v_readlane_b32 s36, v22, 5
-; GFX10_1-NEXT:    v_readlane_b32 s35, v22, 4
-; GFX10_1-NEXT:    v_readlane_b32 s34, v22, 3
-; GFX10_1-NEXT:    v_readlane_b32 s33, v22, 2
-; GFX10_1-NEXT:    v_readlane_b32 s31, v22, 1
-; GFX10_1-NEXT:    v_readlane_b32 s30, v22, 0
+; GFX10_1-NEXT:    v_readlane_b32 s30, v22, 15
+; GFX10_1-NEXT:    v_readlane_b32 s31, v22, 16
+; GFX10_1-NEXT:    v_readlane_b32 s55, v22, 14
+; GFX10_1-NEXT:    v_readlane_b32 s54, v22, 13
+; GFX10_1-NEXT:    v_readlane_b32 s53, v22, 12
+; GFX10_1-NEXT:    v_readlane_b32 s52, v22, 11
+; GFX10_1-NEXT:    v_readlane_b32 s51, v22, 10
+; GFX10_1-NEXT:    v_readlane_b32 s50, v22, 9
+; GFX10_1-NEXT:    v_readlane_b32 s49, v22, 8
+; GFX10_1-NEXT:    v_readlane_b32 s48, v22, 7
+; GFX10_1-NEXT:    v_readlane_b32 s39, v22, 6
+; GFX10_1-NEXT:    v_readlane_b32 s38, v22, 5
+; GFX10_1-NEXT:    v_readlane_b32 s37, v22, 4
+; GFX10_1-NEXT:    v_readlane_b32 s36, v22, 3
+; GFX10_1-NEXT:    v_readlane_b32 s35, v22, 2
+; GFX10_1-NEXT:    v_readlane_b32 s34, v22, 1
+; GFX10_1-NEXT:    v_readlane_b32 s33, v22, 0
 ; GFX10_1-NEXT:    s_xor_saveexec_b32 s4, -1
 ; GFX10_1-NEXT:    s_add_i32 s5, s32, 0x100800
 ; GFX10_1-NEXT:    buffer_load_dword v22, off, s[0:3], s5 ; 4-byte Folded Reload
@@ -1473,31 +1475,31 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
 ; GFX10_3-NEXT:    s_add_i32 s5, s32, 0x100800
 ; GFX10_3-NEXT:    buffer_store_dword v22, off, s[0:3], s5 ; 4-byte Folded Spill
 ; GFX10_3-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10_3-NEXT:    v_writelane_b32 v22, s30, 0
+; GFX10_3-NEXT:    v_writelane_b32 v22, s33, 0
 ; GFX10_3-NEXT:    v_lshrrev_b32_e64 v0, 5, s32
 ; GFX10_3-NEXT:    s_lshr_b32 s4, s32, 5
 ; GFX10_3-NEXT:    s_add_i32 s58, s4, 0x4240
-; GFX10_3-NEXT:    v_writelane_b32 v22, s31, 1
+; GFX10_3-NEXT:    v_writelane_b32 v22, s34, 1
 ; GFX10_3-NEXT:    v_add_nc_u32_e32 v0, 64, v0
 ; GFX10_3-NEXT:    s_and_b32 s4, 0, exec_lo
 ; GFX10_3-NEXT:    ;;#ASMSTART
 ; GFX10_3-NEXT:    ; use alloca0 v0
 ; GFX10_3-NEXT:    ;;#ASMEND
-; GFX10_3-NEXT:    v_writelane_b32 v22, s33, 2
-; GFX10_3-NEXT:    v_writelane_b32 v22, s34, 3
-; GFX10_3-NEXT:    v_writelane_b32 v22, s35, 4
-; GFX10_3-NEXT:    v_writelane_b32 v22, s36, 5
-; GFX10_3-NEXT:    v_writelane_b32 v22, s37, 6
-; GFX10_3-NEXT:    v_writelane_b32 v22, s38, 7
-; GFX10_3-NEXT:    v_writelane_b32 v22, s39, 8
-; GFX10_3-NEXT:    v_writelane_b32 v22, s48, 9
-; GFX10_3-NEXT:    v_writelane_b32 v22, s49, 10
-; GFX10_3-NEXT:    v_writelane_b32 v22, s50, 11
-; GFX10_3-NEXT:    v_writelane_b32 v22, s51, 12
-; GFX10_3-NEXT:    v_writelane_b32 v22, s52, 13
-; GFX10_3-NEXT:    v_writelane_b32 v22, s53, 14
-; GFX10_3-NEXT:    v_writelane_b32 v22, s54, 15
-; GFX10_3-NEXT:    v_writelane_b32 v22, s55, 16
+; GFX10_3-NEXT:    v_writelane_b32 v22, s35, 2
+; GFX10_3-NEXT:    v_writelane_b32 v22, s36, 3
+; GFX10_3-NEXT:    v_writelane_b32 v22, s37, 4
+; GFX10_3-NEXT:    v_writelane_b32 v22, s38, 5
+; GFX10_3-NEXT:    v_writelane_b32 v22, s39, 6
+; GFX10_3-NEXT:    v_writelane_b32 v22, s48, 7
+; GFX10_3-NEXT:    v_writelane_b32 v22, s49, 8
+; GFX10_3-NEXT:    v_writelane_b32 v22, s50, 9
+; GFX10_3-NEXT:    v_writelane_b32 v22, s51, 10
+; GFX10_3-NEXT:    v_writelane_b32 v22, s52, 11
+; GFX10_3-NEXT:    v_writelane_b32 v22, s53, 12
+; GFX10_3-NEXT:    v_writelane_b32 v22, s54, 13
+; GFX10_3-NEXT:    v_writelane_b32 v22, s55, 14
+; GFX10_3-NEXT:    v_writelane_b32 v22, s30, 15
+; GFX10_3-NEXT:    v_writelane_b32 v22, s31, 16
 ; GFX10_3-NEXT:    ;;#ASMSTART
 ; GFX10_3-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc
 ; GFX10_3-NEXT:    ;;#ASMEND
@@ -1506,23 +1508,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
 ; GFX10_3-NEXT:    ;;#ASMSTART
 ; GFX10_3-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s54, scc
 ; GFX10_3-NEXT:    ;;#ASMEND
-; GFX10_3-NEXT:    v_readlane_b32 s55, v22, 16
-; GFX10_3-NEXT:    v_readlane_b32 s54, v22, 15
-; GFX10_3-NEXT:    v_readlane_b32 s53, v22, 14
-; GFX10_3-NEXT:    v_readlane_b32 s52, v22, 13
-; GFX10_3-NEXT:    v_readlane_b32 s51, v22, 12
-; GFX10_3-NEXT:    v_readlane_b32 s50, v22, 11
-; GFX10_3-NEXT:    v_readlane_b32 s49, v22, 10
-; GFX10_3-NEXT:    v_readlane_b32 s48, v22, 9
-; GFX10_3-NEXT:    v_readlane_b32 s39, v22, 8
-; GFX10_3-NEXT:    v_readlane_b32 s38, v22, 7
-; GFX10_3-NEXT:    v_readlane_b32 s37, v22, 6
-; GFX10_3-NEXT:    v_readlane_b32 s36, v22, 5
-; GFX10_3-NEXT:    v_readlane_b32 s35, v22, 4
-; GFX10_3-NEXT:    v_readlane_b32 s34, v22, 3
-; GFX10_3-NEXT:    v_readlane_b32 s33, v22, 2
-; GFX10_3-NEXT:    v_readlane_b32 s31, v22, 1
-; GFX10_3-NEXT:    v_readlane_b32 s30, v22, 0
+; GFX10_3-NEXT:    v_readlane_b32 s30, v22, 15
+; GFX10_3-NEXT:    v_readlane_b32 s31, v22, 16
+; GFX10_3-NEXT:    v_readlane_b32 s55, v22, 14
+; GFX10_3-NEXT:    v_readlane_b32 s54, v22, 13
+; GFX10_3-NEXT:    v_readlane_b32 s53, v22, 12
+; GFX10_3-NEXT:    v_readlane_b32 s52, v22, 11
+; GFX10_3-NEXT:    v_readlane_b32 s51, v22, 10
+; GFX10_3-NEXT:    v_readlane_b32 s50, v22, 9
+; GFX10_3-NEXT:    v_readlane_b32 s49, v22, 8
+; GFX10_3-NEXT:    v_readlane_b32 s48, v22, 7
+; GFX10_3-NEXT:    v_readlane_b32 s39, v22, 6
+; GFX10_3-NEXT:    v_readlane_b32 s38, v22, 5
+; GFX10_3-NEXT:    v_readlane_b32 s37, v22, 4
+; GFX10_3-NEXT:    v_readlane_b32 s36, v22, 3
+; GFX10_3-NEXT:    v_readlane_b32 s35, v22, 2
+; GFX10_3-NEXT:    v_readlane_b32 s34, v22, 1
+; GFX10_3-NEXT:    v_readlane_b32 s33, v22, 0
 ; GFX10_3-NEXT:    s_xor_saveexec_b32 s4, -1
 ; GFX10_3-NEXT:    s_add_i32 s5, s32, 0x100800
 ; GFX10_3-NEXT:    buffer_load_dword v22, off, s[0:3], s5 ; 4-byte Folded Reload
@@ -1537,30 +1539,30 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
 ; GFX11-NEXT:    s_add_i32 s1, s32, 0x8040
 ; GFX11-NEXT:    scratch_store_b32 off, v22, s1 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
-; GFX11-NEXT:    v_writelane_b32 v22, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v22, s33, 0
 ; GFX11-NEXT:    s_add_i32 s0, s32, 64
 ; GFX11-NEXT:    s_add_i32 s58, s32, 0x4240
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-NEXT:    s_and_b32 s0, 0, exec_lo
-; GFX11-NEXT:    v_writelane_b32 v22, s31, 1
+; GFX11-NEXT:    v_writelane_b32 v22, s34, 1
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ; use alloca0 v0
 ; GFX11-NEXT:    ;;#ASMEND
-; GFX11-NEXT:    v_writelane_b32 v22, s33, 2
-; GFX11-NEXT:    v_writelane_b32 v22, s34, 3
-; GFX11-NEXT:    v_writelane_b32 v22, s35, 4
-; GFX11-NEXT:    v_writelane_b32 v22, s36, 5
-; GFX11-NEXT:    v_writelane_b32 v22, s37, 6
-; GFX11-NEXT:    v_writelane_b32 v22, s38, 7
-; GFX11-NEXT:    v_writelane_b32 v22, s39, 8
-; GFX11-NEXT:    v_writelane_b32 v22, s48, 9
-; GFX11-NEXT:    v_writelane_b32 v22, s49, 10
-; GFX11-NEXT:    v_writelane_b32 v22, s50, 11
-; GFX11-NEXT:    v_writelane_b32 v22, s51, 12
-; GFX11-NEXT:    v_writelane_b32 v22, s52, 13
-; GFX11-NEXT:    v_writelane_b32 v22, s53, 14
-; GFX11-NEXT:    v_writelane_b32 v22, s54, 15
-; GFX11-NEXT:    v_writelane_b32 v22, s55, 16
+; GFX11-NEXT:    v_writelane_b32 v22, s35, 2
+; GFX11-NEXT:    v_writelane_b32 v22, s36, 3
+; GFX11-NEXT:    v_writelane_b32 v22, s37, 4
+; GFX11-NEXT:    v_writelane_b32 v22, s38, 5
+; GFX11-NEXT:    v_writelane_b32 v22, s39, 6
+; GFX11-NEXT:    v_writelane_b32 v22, s48, 7
+; GFX11-NEXT:    v_writelane_b32 v22, s49, 8
+; GFX11-NEXT:    v_writelane_b32 v22, s50, 9
+; GFX11-NEXT:    v_writelane_b32 v22, s51, 10
+; GFX11-NEXT:    v_writelane_b32 v22, s52, 11
+; GFX11-NEXT:    v_writelane_b32 v22, s53, 12
+; GFX11-NEXT:    v_writelane_b32 v22, s54, 13
+; GFX11-NEXT:    v_writelane_b32 v22, s55, 14
+; GFX11-NEXT:    v_writelane_b32 v22, s30, 15
+; GFX11-NEXT:    v_writelane_b32 v22, s31, 16
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc
 ; GFX11-NEXT:    ;;#ASMEND
@@ -1570,23 +1572,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
 ; GFX11-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s54, scc
 ; GFX11-NEXT:    ;;#ASMEND
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s55, v22, 16
-; GFX11-NEXT:    v_readlane_b32 s54, v22, 15
-; GFX11-NEXT:    v_readlane_b32 s53, v22, 14
-; GFX11-NEXT:    v_readlane_b32 s52, v22, 13
-; GFX11-NEXT:    v_readlane_b32 s51, v22, 12
-; GFX11-NEXT:    v_readlane_b32 s50, v22, 11
-; GFX11-NEXT:    v_readlane_b32 s49, v22, 10
-; GFX11-NEXT:    v_readlane_b32 s48, v22, 9
-; GFX11-NEXT:    v_readlane_b32 s39, v22, 8
-; GFX11-NEXT:    v_readlane_b32 s38, v22, 7
-; GFX11-NEXT:    v_readlane_b32 s37, v22, 6
-; GFX11-NEXT:    v_readlane_b32 s36, v22, 5
-; GFX11-NEXT:    v_readlane_b32 s35, v22, 4
-; GFX11-NEXT:    v_readlane_b32 s34, v22, 3
-; GFX11-NEXT:    v_readlane_b32 s33, v22, 2
-; GFX11-NEXT:    v_readlane_b32 s31, v22, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v22, 0
+; GFX11-NEXT:    v_readlane_b32 s30, v22, 15
+; GFX11-NEXT:    v_readlane_b32 s31, v22, 16
+; GFX11-NEXT:    v_readlane_b32 s55, v22, 14
+; GFX11-NEXT:    v_readlane_b32 s54, v22, 13
+; GFX11-NEXT:    v_readlane_b32 s53, v22, 12
+; GFX11-NEXT:    v_readlane_b32 s52, v22, 11
+; GFX11-NEXT:    v_readlane_b32 s51, v22, 10
+; GFX11-NEXT:    v_readlane_b32 s50, v22, 9
+; GFX11-NEXT:    v_readlane_b32 s49, v22, 8
+; GFX11-NEXT:    v_readlane_b32 s48, v22, 7
+; GFX11-NEXT:    v_readlane_b32 s39, v22, 6
+; GFX11-NEXT:    v_readlane_b32 s38, v22, 5
+; GFX11-NEXT:    v_readlane_b32 s37, v22, 4
+; GFX11-NEXT:    v_readlane_b32 s36, v22, 3
+; GFX11-NEXT:    v_readlane_b32 s35, v22, 2
+; GFX11-NEXT:    v_readlane_b32 s34, v22, 1
+; GFX11-NEXT:    v_readlane_b32 s33, v22, 0
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX11-NEXT:    s_add_i32 s1, s32, 0x8040
 ; GFX11-NEXT:    scratch_load_b32 v22, off, s1 ; 4-byte Folded Reload
@@ -1605,29 +1607,29 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
 ; GFX12-NEXT:    scratch_store_b32 off, v22, s32 offset:32768 ; 4-byte Folded Spill
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_mov_b32 exec_lo, s0
-; GFX12-NEXT:    v_writelane_b32 v22, s30, 0
+; GFX12-NEXT:    v_writelane_b32 v22, s33, 0
 ; GFX12-NEXT:    s_add_co_i32 s58, s32, 0x4200
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s32
 ; GFX12-NEXT:    s_and_b32 s0, 0, exec_lo
 ; GFX12-NEXT:    ;;#ASMSTART
 ; GFX12-NEXT:    ; use alloca0 v0
 ; GFX12-NEXT:    ;;#ASMEND
-; GFX12-NEXT:    v_writelane_b32 v22, s31, 1
-; GFX12-NEXT:    v_writelane_b32 v22, s33, 2
-; GFX12-NEXT:    v_writelane_b32 v22, s34, 3
-; GFX12-NEXT:    v_writelane_b32 v22, s35, 4
-; GFX12-NEXT:    v_writelane_b32 v22, s36, 5
-; GFX12-NEXT:    v_writelane_b32 v22, s37, 6
-; GFX12-NEXT:    v_writelane_b32 v22, s38, 7
-; GFX12-NEXT:    v_writelane_b32 v22, s39, 8
-; GFX12-NEXT:    v_writelane_b32 v22, s48, 9
-; GFX12-NEXT:    v_writelane_b32 v22, s49, 10
-; GFX12-NEXT:    v_writelane_b32 v22, s50, 11
-; GFX12-NEXT:    v_writelane_b32 v22, s51, 12
-; GFX12-NEXT:    v_writelane_b32 v22, s52, 13
-; GFX12-NEXT:    v_writelane_b32 v22, s53, 14
-; GFX12-NEXT:    v_writelane_b32 v22, s54, 15
-; GFX12-NEXT:    v_writelane_b32 v22, s55, 16
+; GFX12-NEXT:    v_writelane_b32 v22, s34, 1
+; GFX12-NEXT:    v_writelane_b32 v22, s35, 2
+; GFX12-NEXT:    v_writelane_b32 v22, s36, 3
+; GFX12-NEXT:    v_writelane_b32 v22, s37, 4
+; GFX12-NEXT:    v_writelane_b32 v22, s38, 5
+; GFX12-NEXT:    v_writelane_b32 v22, s39, 6
+; GFX12-NEXT:    v_writelane_b32 v22, s48, 7
+; GFX12-NEXT:    v_writelane_b32 v22, s49, 8
+; GFX12-NEXT:    v_writelane_b32 v22, s50, 9
+; GFX12-NEXT:    v_writelane_b32 v22, s51, 10
+; GFX12-NEXT:    v_writelane_b32 v22, s52, 11
+; GFX12-NEXT:    v_writelane_b32 v22, s53, 12
+; GFX12-NEXT:    v_writelane_b32 v22, s54, 13
+; GFX12-NEXT:    v_writelane_b32 v22, s55, 14
+; GFX12-NEXT:    v_writelane_b32 v22, s30, 15
+; GFX12-NEXT:    v_writelane_b32 v22, s31, 16
 ; GFX12-NEXT:    ;;#ASMSTART
 ; GFX12-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc
 ; GFX12-NEXT:    ;;#ASMEND
@@ -1637,23 +1639,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
 ; GFX12-NEXT:    ;;#ASMSTART
 ; GFX12-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s54, scc
 ; GFX12-NEXT:    ;;#ASMEND
-; GFX12-NEXT:    v_readlane_b32 s55, v22, 16
-; GFX12-NEXT:    v_readlane_b32 s54, v22, 15
-; GFX12-NEXT:    v_readlane_b32 s53, v22, 14
-; GFX12-NEXT:    v_readlane_b32 s52, v22, 13
-; GFX12-NEXT:    v_readlane_b32 s51, v22, 12
-; GFX12-NEXT:    v_readlane_b32 s50, v22, 11
-; GFX12-NEXT:    v_readlane_b32 s49, v22, 10
-; GFX12-NEXT:    v_readlane_b32 s48, v22, 9
-; GFX12-NEXT:    v_readlane_b32 s39, v22, 8
-; GFX12-NEXT:    v_readlane_b32 s38, v22, 7
-; GFX12-NEXT:    v_readlane_b32 s37, v22, 6
-; GFX12-NEXT:    v_readlane_b32 s36, v22, 5
-; GFX12-NEXT:    v_readlane_b32 s35, v22, 4
-; GFX12-NEXT:    v_readlane_b32 s34, v22, 3
-; GFX12-NEXT:    v_readlane_b32 s33, v22, 2
-; GFX12-NEXT:    v_readlane_b32 s31, v22, 1
-; GFX12-NEXT:    v_readlane_b32 s30, v22, 0
+; GFX12-NEXT:    v_readlane_b32 s30, v22, 15
+; GFX12-NEXT:    v_readlane_b32 s31, v22, 16
+; GFX12-NEXT:    v_readlane_b32 s55, v22, 14
+; GFX12-NEXT:    v_readlane_b32 s54, v22, 13
+; GFX12-NEXT:    v_readlane_b32 s53, v22, 12
+; GFX12-NEXT:    v_readlane_b32 s52, v22, 11
+; GFX12-NEXT:    v_readlane_b32 s51, v22, 10
+; GFX12-NEXT:    v_readlane_b32 s50, v22, 9
+; GFX12-NEXT:    v_readlane_b32 s49, v22, 8
+; GFX12-NEXT:    v_readlane_b32 s48, v22, 7
+; GFX12-NEXT:    v_readlane_b32 s39, v22, 6
+; GFX12-NEXT:    v_readlane_b32 s38, v22, 5
+; GFX12-NEXT:    v_readlane_b32 s37, v22, 4
+; GFX12-NEXT:    v_readlane_b32 s36, v22, 3
+; GFX12-NEXT:    v_readlane_b32 s35, v22, 2
+; GFX12-NEXT:    v_readlane_b32 s34, v22, 1
+; GFX12-NEXT:    v_readlane_b32 s33, v22, 0
 ; GFX12-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX12-NEXT:    scratch_load_b32 v22, off, s32 offset:32768 ; 4-byte Folded Reload
 ; GFX12-NEXT:    s_wait_alu 0xfffe
diff --git a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
index 33cd598aae9b5..7b87cb014ecbf 100644
--- a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
@@ -194,22 +194,22 @@ define void @slsr1_1(i32 %b.arg, i32 %s.arg) #0 {
 ; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX9-NEXT:    v_writelane_b32 v43, s4, 5
-; GFX9-NEXT:    v_writelane_b32 v43, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v43, s31, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0x800
-; GFX9-NEXT:    v_writelane_b32 v43, s34, 2
-; GFX9-NEXT:    v_writelane_b32 v43, s36, 3
+; GFX9-NEXT:    v_writelane_b32 v43, s34, 0
+; GFX9-NEXT:    v_writelane_b32 v43, s36, 1
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, foo at gotpcrel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, foo at gotpcrel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v43, s37, 4
+; GFX9-NEXT:    v_writelane_b32 v43, s37, 2
 ; GFX9-NEXT:    s_load_dwordx2 s[36:37], s[4:5], 0x0
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_mov_b32_e32 v40, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v41, v0
+; GFX9-NEXT:    v_writelane_b32 v43, s30, 3
 ; GFX9-NEXT:    v_mul_u32_u24_e32 v0, v41, v40
+; GFX9-NEXT:    v_writelane_b32 v43, s31, 4
 ; GFX9-NEXT:    s_mov_b32 s34, s15
 ; GFX9-NEXT:    v_and_b32_e32 v42, 0xffffff, v40
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -224,11 +224,11 @@ define void @slsr1_1(i32 %b.arg, i32 %s.arg) #0 {
 ; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_readlane_b32 s37, v43, 4
-; GFX9-NEXT:    v_readlane_b32 s36, v43, 3
-; GFX9-NEXT:    v_readlane_b32 s34, v43, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v43, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v43, 0
+; GFX9-NEXT:    v_readlane_b32 s30, v43, 3
+; GFX9-NEXT:    v_readlane_b32 s31, v43, 4
+; GFX9-NEXT:    v_readlane_b32 s37, v43, 2
+; GFX9-NEXT:    v_readlane_b32 s36, v43, 1
+; GFX9-NEXT:    v_readlane_b32 s34, v43, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v43, 5
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll
index 65446a036c91b..878302e4865bb 100644
--- a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll
+++ b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll
@@ -47,8 +47,8 @@ define internal fastcc void @csr_vgpr_spill_fp_callee() #0 {
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; clobber csr v40
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s31, v1, 1
 ; CHECK-NEXT:    v_readlane_b32 s30, v1, 0
+; CHECK-NEXT:    v_readlane_b32 s31, v1, 1
 ; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; CHECK-NEXT:    s_mov_b32 s32, s33
 ; CHECK-NEXT:    s_xor_saveexec_b64 s[4:5], -1
@@ -190,8 +190,8 @@ define hidden i32 @caller_save_vgpr_spill_fp_tail_call() #0 {
 ; CHECK-NEXT:    s_mov_b64 s[0:1], s[20:21]
 ; CHECK-NEXT:    s_mov_b64 s[2:3], s[22:23]
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT:    v_readlane_b32 s31, v1, 1
 ; CHECK-NEXT:    v_readlane_b32 s30, v1, 0
+; CHECK-NEXT:    v_readlane_b32 s31, v1, 1
 ; CHECK-NEXT:    s_mov_b32 s32, s33
 ; CHECK-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; CHECK-NEXT:    buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -224,8 +224,8 @@ define hidden i32 @caller_save_vgpr_spill_fp() #0 {
 ; CHECK-NEXT:    s_mov_b64 s[0:1], s[20:21]
 ; CHECK-NEXT:    s_mov_b64 s[2:3], s[22:23]
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT:    v_readlane_b32 s31, v2, 1
 ; CHECK-NEXT:    v_readlane_b32 s30, v2, 0
+; CHECK-NEXT:    v_readlane_b32 s31, v2, 1
 ; CHECK-NEXT:    s_mov_b32 s32, s33
 ; CHECK-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; CHECK-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/nested-calls.ll b/llvm/test/CodeGen/AMDGPU/nested-calls.ll
index ccaf0ac5377e4..8394b325bee6d 100644
--- a/llvm/test/CodeGen/AMDGPU/nested-calls.ll
+++ b/llvm/test/CodeGen/AMDGPU/nested-calls.ll
@@ -29,8 +29,8 @@ define void @test_func_call_external_void_func_i32_imm() #0 {
 ; GCN-NEXT:    v_writelane_b32 v40, s31, 1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 2
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -68,8 +68,8 @@ define void @test_func_call_external_void_func_i32_imm_stack_use() #0 {
 ; GCN-NEXT:    v_writelane_b32 v40, s31, 1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 2
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll b/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll
index 6b6c60ebe2a9e..133cc166c3311 100644
--- a/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll
+++ b/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll
@@ -247,8 +247,8 @@ define hidden void @_ZL3barv() #0 !dbg !1644 {
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; CHECK-NEXT:  .Ltmp1:
 ; CHECK-NEXT:    .loc 0 32 1 ; lane-info.cpp:32:1
-; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
 ; CHECK-NEXT:    v_readlane_b32 s30, v40, 0
+; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
 ; CHECK-NEXT:    s_mov_b32 s32, s33
 ; CHECK-NEXT:    v_readlane_b32 s4, v40, 2
 ; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll b/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll
index dba10f19eb500..ef5681798fb19 100644
--- a/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll
+++ b/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll
@@ -20,9 +20,9 @@ define void @test_remat_s_getpc_b64() {
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_readlane_b32 s30, v2, 0
 ; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
 ; GFX9-NEXT:    v_readlane_b32 s31, v2, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v2, 0
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -45,8 +45,8 @@ define void @test_remat_s_getpc_b64() {
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT:    v_readlane_b32 s31, v2, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v2, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v2, 1
 ; GFX11-NEXT:    global_store_b64 v[0:1], v[0:1], off
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_load_b32 v2, off, s32 ; 4-byte Folded Reload
@@ -79,8 +79,8 @@ define void @test_remat_s_getpc_b64() {
 ; GFX12-NEXT:    s_sext_i32_i16 s1, s1
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT:    v_readlane_b32 s31, v2, 1
 ; GFX12-NEXT:    v_readlane_b32 s30, v2, 0
+; GFX12-NEXT:    v_readlane_b32 s31, v2, 1
 ; GFX12-NEXT:    global_store_b64 v[0:1], v[0:1], off
 ; GFX12-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX12-NEXT:    scratch_load_b32 v2, off, s32 ; 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir
index 925984b15367d..4a2829947f9a0 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir
@@ -28,16 +28,16 @@ body:             |
   ; GCN-LABEL: name: test_main
   ; GCN: bb.0:
   ; GCN-NEXT:   successors: %bb.1(0x80000000)
-  ; GCN-NEXT:   liveins: $vcc_hi, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $sgpr30, $sgpr31, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $sgpr96, $sgpr97, $sgpr98, $sgpr99, $sgpr100, $sgpr101, $sgpr102, $sgpr103, $vgpr0
+  ; GCN-NEXT:   liveins: $vcc_hi, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $sgpr96, $sgpr97, $sgpr98, $sgpr99, $sgpr100, $sgpr101, $sgpr102, $sgpr103, $vgpr0, $sgpr30_sgpr31
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT:   $vcc_hi = frame-setup COPY $sgpr33
   ; GCN-NEXT:   $sgpr33 = frame-setup COPY $sgpr32
   ; GCN-NEXT:   $sgpr0 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
-  ; GCN-NEXT:   SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.69, addrspace 5)
-  ; GCN-NEXT:   SCRATCH_STORE_DWORD_SADDR killed $vgpr2, $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.70, addrspace 5)
-  ; GCN-NEXT:   SCRATCH_STORE_DWORD_SADDR killed $vgpr3, $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.71, addrspace 5)
-  ; GCN-NEXT:   SCRATCH_STORE_DWORD_SADDR killed $vgpr4, $sgpr33, 12, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.72, addrspace 5)
-  ; GCN-NEXT:   SCRATCH_STORE_DWORD_SADDR killed $vgpr5, $sgpr33, 16, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.73, addrspace 5)
+  ; GCN-NEXT:   SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.68, addrspace 5)
+  ; GCN-NEXT:   SCRATCH_STORE_DWORD_SADDR killed $vgpr2, $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.69, addrspace 5)
+  ; GCN-NEXT:   SCRATCH_STORE_DWORD_SADDR killed $vgpr3, $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.70, addrspace 5)
+  ; GCN-NEXT:   SCRATCH_STORE_DWORD_SADDR killed $vgpr4, $sgpr33, 12, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.71, addrspace 5)
+  ; GCN-NEXT:   SCRATCH_STORE_DWORD_SADDR killed $vgpr5, $sgpr33, 16, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.72, addrspace 5)
   ; GCN-NEXT:   $exec_lo = S_MOV_B32 killed $sgpr0
   ; GCN-NEXT:   $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 24, implicit-def dead $scc
   ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, $vgpr2
@@ -66,48 +66,48 @@ body:             |
   ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr27, 23, $vgpr2
   ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr28, 24, $vgpr2
   ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr29, 25, $vgpr2
-  ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr30, 26, $vgpr2
-  ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr31, 27, $vgpr2
-  ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr64, 28, $vgpr2
-  ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr65, 29, $vgpr2
-  ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr66, 30, $vgpr2
-  ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr67, 31, $vgpr2
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr68, 0, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr69, 1, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr70, 2, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr71, 3, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr72, 4, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr73, 5, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr74, 6, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr75, 7, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr76, 8, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr77, 9, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr78, 10, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr79, 11, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr80, 12, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr81, 13, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr82, 14, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr83, 15, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr84, 16, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr85, 17, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr86, 18, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr87, 19, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr88, 20, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr89, 21, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr90, 22, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr91, 23, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr92, 24, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr93, 25, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr94, 26, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr95, 27, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr96, 28, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr97, 29, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr98, 30, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr99, 31, $vgpr3
-  ; GCN-NEXT:   $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr100, 0, $vgpr4
-  ; GCN-NEXT:   $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr101, 1, $vgpr4
-  ; GCN-NEXT:   $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr102, 2, $vgpr4
-  ; GCN-NEXT:   $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr103, 3, $vgpr4
+  ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr64, 26, $vgpr2
+  ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr65, 27, $vgpr2
+  ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr66, 28, $vgpr2
+  ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr67, 29, $vgpr2
+  ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr68, 30, $vgpr2
+  ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr69, 31, $vgpr2
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr70, 0, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr71, 1, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr72, 2, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr73, 3, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr74, 4, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr75, 5, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr76, 6, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr77, 7, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr78, 8, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr79, 9, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr80, 10, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr81, 11, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr82, 12, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr83, 13, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr84, 14, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr85, 15, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr86, 16, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr87, 17, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr88, 18, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr89, 19, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr90, 20, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr91, 21, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr92, 22, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr93, 23, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr94, 24, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr95, 25, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr96, 26, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr97, 27, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr98, 28, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr99, 29, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr100, 30, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr101, 31, $vgpr3
+  ; GCN-NEXT:   $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr102, 0, $vgpr4
+  ; GCN-NEXT:   $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr103, 1, $vgpr4
+  ; GCN-NEXT:   $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr30, 2, $vgpr4, implicit-def $sgpr30_sgpr31, implicit $sgpr30_sgpr31
+  ; GCN-NEXT:   $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr31, 3, $vgpr4, implicit $sgpr30_sgpr31
   ; GCN-NEXT:   $sgpr22 = IMPLICIT_DEF
   ; GCN-NEXT:   $vgpr5 = IMPLICIT_DEF
   ; GCN-NEXT:   $vgpr5 = SI_SPILL_S32_TO_VGPR $sgpr22, 0, killed $vgpr5
@@ -130,48 +130,48 @@ body:             |
   ; GCN-NEXT: bb.3:
   ; GCN-NEXT:   liveins: $vcc_hi
   ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT:   $sgpr103 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 3
-  ; GCN-NEXT:   $sgpr102 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 2
-  ; GCN-NEXT:   $sgpr101 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 1
-  ; GCN-NEXT:   $sgpr100 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 0
-  ; GCN-NEXT:   $sgpr99 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 31
-  ; GCN-NEXT:   $sgpr98 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 30
-  ; GCN-NEXT:   $sgpr97 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 29
-  ; GCN-NEXT:   $sgpr96 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 28
-  ; GCN-NEXT:   $sgpr95 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 27
-  ; GCN-NEXT:   $sgpr94 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 26
-  ; GCN-NEXT:   $sgpr93 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 25
-  ; GCN-NEXT:   $sgpr92 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 24
-  ; GCN-NEXT:   $sgpr91 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 23
-  ; GCN-NEXT:   $sgpr90 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 22
-  ; GCN-NEXT:   $sgpr89 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 21
-  ; GCN-NEXT:   $sgpr88 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 20
-  ; GCN-NEXT:   $sgpr87 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 19
-  ; GCN-NEXT:   $sgpr86 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 18
-  ; GCN-NEXT:   $sgpr85 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 17
-  ; GCN-NEXT:   $sgpr84 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 16
-  ; GCN-NEXT:   $sgpr83 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 15
-  ; GCN-NEXT:   $sgpr82 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 14
-  ; GCN-NEXT:   $sgpr81 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 13
-  ; GCN-NEXT:   $sgpr80 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 12
-  ; GCN-NEXT:   $sgpr79 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 11
-  ; GCN-NEXT:   $sgpr78 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 10
-  ; GCN-NEXT:   $sgpr77 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 9
-  ; GCN-NEXT:   $sgpr76 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 8
-  ; GCN-NEXT:   $sgpr75 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 7
-  ; GCN-NEXT:   $sgpr74 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 6
-  ; GCN-NEXT:   $sgpr73 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 5
-  ; GCN-NEXT:   $sgpr72 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 4
-  ; GCN-NEXT:   $sgpr71 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 3
-  ; GCN-NEXT:   $sgpr70 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 2
-  ; GCN-NEXT:   $sgpr69 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 1
-  ; GCN-NEXT:   $sgpr68 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 0
-  ; GCN-NEXT:   $sgpr67 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 31
-  ; GCN-NEXT:   $sgpr66 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 30
-  ; GCN-NEXT:   $sgpr65 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 29
-  ; GCN-NEXT:   $sgpr64 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 28
-  ; GCN-NEXT:   $sgpr31 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 27
-  ; GCN-NEXT:   $sgpr30 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 26
+  ; GCN-NEXT:   $sgpr30 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 2, implicit-def $sgpr30_sgpr31
+  ; GCN-NEXT:   $sgpr31 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 3
+  ; GCN-NEXT:   $sgpr103 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 1
+  ; GCN-NEXT:   $sgpr102 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 0
+  ; GCN-NEXT:   $sgpr101 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 31
+  ; GCN-NEXT:   $sgpr100 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 30
+  ; GCN-NEXT:   $sgpr99 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 29
+  ; GCN-NEXT:   $sgpr98 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 28
+  ; GCN-NEXT:   $sgpr97 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 27
+  ; GCN-NEXT:   $sgpr96 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 26
+  ; GCN-NEXT:   $sgpr95 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 25
+  ; GCN-NEXT:   $sgpr94 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 24
+  ; GCN-NEXT:   $sgpr93 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 23
+  ; GCN-NEXT:   $sgpr92 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 22
+  ; GCN-NEXT:   $sgpr91 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 21
+  ; GCN-NEXT:   $sgpr90 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 20
+  ; GCN-NEXT:   $sgpr89 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 19
+  ; GCN-NEXT:   $sgpr88 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 18
+  ; GCN-NEXT:   $sgpr87 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 17
+  ; GCN-NEXT:   $sgpr86 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 16
+  ; GCN-NEXT:   $sgpr85 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 15
+  ; GCN-NEXT:   $sgpr84 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 14
+  ; GCN-NEXT:   $sgpr83 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 13
+  ; GCN-NEXT:   $sgpr82 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 12
+  ; GCN-NEXT:   $sgpr81 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 11
+  ; GCN-NEXT:   $sgpr80 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 10
+  ; GCN-NEXT:   $sgpr79 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 9
+  ; GCN-NEXT:   $sgpr78 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 8
+  ; GCN-NEXT:   $sgpr77 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 7
+  ; GCN-NEXT:   $sgpr76 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 6
+  ; GCN-NEXT:   $sgpr75 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 5
+  ; GCN-NEXT:   $sgpr74 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 4
+  ; GCN-NEXT:   $sgpr73 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 3
+  ; GCN-NEXT:   $sgpr72 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 2
+  ; GCN-NEXT:   $sgpr71 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 1
+  ; GCN-NEXT:   $sgpr70 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 0
+  ; GCN-NEXT:   $sgpr69 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 31
+  ; GCN-NEXT:   $sgpr68 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 30
+  ; GCN-NEXT:   $sgpr67 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 29
+  ; GCN-NEXT:   $sgpr66 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 28
+  ; GCN-NEXT:   $sgpr65 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 27
+  ; GCN-NEXT:   $sgpr64 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 26
   ; GCN-NEXT:   $sgpr29 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 25
   ; GCN-NEXT:   $sgpr28 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 24
   ; GCN-NEXT:   $sgpr27 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 23
@@ -200,11 +200,11 @@ body:             |
   ; GCN-NEXT:   $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 0
   ; GCN-NEXT:   $sgpr32 = frame-destroy COPY $sgpr33
   ; GCN-NEXT:   $sgpr0 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
-  ; GCN-NEXT:   $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.69, addrspace 5)
-  ; GCN-NEXT:   $vgpr2 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.70, addrspace 5)
-  ; GCN-NEXT:   $vgpr3 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.71, addrspace 5)
-  ; GCN-NEXT:   $vgpr4 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 12, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.72, addrspace 5)
-  ; GCN-NEXT:   $vgpr5 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 16, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.73, addrspace 5)
+  ; GCN-NEXT:   $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.68, addrspace 5)
+  ; GCN-NEXT:   $vgpr2 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.69, addrspace 5)
+  ; GCN-NEXT:   $vgpr3 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.70, addrspace 5)
+  ; GCN-NEXT:   $vgpr4 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 12, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.71, addrspace 5)
+  ; GCN-NEXT:   $vgpr5 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 16, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.72, addrspace 5)
   ; GCN-NEXT:   $exec_lo = S_MOV_B32 killed $sgpr0
   ; GCN-NEXT:   $sgpr33 = frame-destroy COPY $vcc_hi
   ; GCN-NEXT:   S_ENDPGM 0
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll
index 702953c56a5cb..cb54b0ba629c3 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll
@@ -152,8 +152,8 @@ define void @spill_sgpr_with_no_lower_vgpr_available() #0 {
 ; GCN-NEXT:    s_mov_b64 s[2:3], s[22:23]
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GCN-NEXT:    v_readlane_b32 s31, v255, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v255, 0
+; GCN-NEXT:    v_readlane_b32 s31, v255, 1
 ; GCN-NEXT:    buffer_load_dword v254, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GCN-NEXT:    buffer_load_dword v253, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GCN-NEXT:    buffer_load_dword v252, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
@@ -445,8 +445,8 @@ define void @spill_to_lowest_available_vgpr() #0 {
 ; GCN-NEXT:    s_mov_b64 s[2:3], s[22:23]
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GCN-NEXT:    v_readlane_b32 s31, v254, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v254, 0
+; GCN-NEXT:    v_readlane_b32 s31, v254, 1
 ; GCN-NEXT:    buffer_load_dword v253, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GCN-NEXT:    buffer_load_dword v252, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GCN-NEXT:    buffer_load_dword v251, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
@@ -1632,21 +1632,14 @@ define void @spill_sgpr_no_free_vgpr_ipra() #0 {
 ; GCN-NEXT:    buffer_store_dword v254, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GCN-NEXT:    buffer_store_dword v255, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 s[16:17], exec
-; GCN-NEXT:    s_mov_b64 exec, 1
+; GCN-NEXT:    s_mov_b64 exec, 3
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:456
 ; GCN-NEXT:    v_writelane_b32 v0, s30, 0
+; GCN-NEXT:    v_writelane_b32 v0, s31, 1
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:448 ; 4-byte Folded Spill
 ; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:456
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_mov_b64 exec, s[16:17]
-; GCN-NEXT:    s_mov_b64 s[16:17], exec
-; GCN-NEXT:    s_mov_b64 exec, 1
-; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:456
-; GCN-NEXT:    v_writelane_b32 v0, s31, 0
-; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:452 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:456
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    s_mov_b64 exec, s[16:17]
 ; GCN-NEXT:    s_getpc_b64 s[16:17]
 ; GCN-NEXT:    s_add_u32 s16, s16, child_function_ipra at rel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s17, s17, child_function_ipra at rel32@hi+12
@@ -1656,20 +1649,12 @@ define void @spill_sgpr_no_free_vgpr_ipra() #0 {
 ; GCN-NEXT:    s_mov_b64 s[2:3], s[22:23]
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GCN-NEXT:    s_mov_b64 s[4:5], exec
-; GCN-NEXT:    s_mov_b64 exec, 1
-; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:456
-; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:452 ; 4-byte Folded Reload
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_readlane_b32 s31, v0, 0
-; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:456
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
-; GCN-NEXT:    s_mov_b64 s[4:5], exec
-; GCN-NEXT:    s_mov_b64 exec, 1
+; GCN-NEXT:    s_mov_b64 exec, 3
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:456
 ; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:448 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_readlane_b32 s30, v0, 0
+; GCN-NEXT:    v_readlane_b32 s31, v0, 1
 ; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:456
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll
index 1c2215d39dc02..feaca47f98e36 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll
@@ -14610,13 +14610,13 @@ define void @s_shuffle_v2i64_v8i64__15_2() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s30
 ; GFX900-NEXT:    s_mov_b32 s9, s31
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b32 s10, s12
 ; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -14639,13 +14639,13 @@ define void @s_shuffle_v2i64_v8i64__15_2() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s30
 ; GFX90A-NEXT:    s_mov_b32 s9, s31
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b32 s10, s12
 ; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -14750,13 +14750,13 @@ define void @s_shuffle_v2i64_v8i64__15_4() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s30
 ; GFX900-NEXT:    s_mov_b32 s9, s31
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b32 s10, s12
 ; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -14779,13 +14779,13 @@ define void @s_shuffle_v2i64_v8i64__15_4() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s30
 ; GFX90A-NEXT:    s_mov_b32 s9, s31
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b32 s10, s12
 ; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -14802,19 +14802,19 @@ define void @s_shuffle_v2i64_v8i64__15_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b32 s10, s12
+; GFX942-NEXT:    s_mov_b32 s11, s13
 ; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_mov_b32 s8, s30
 ; GFX942-NEXT:    s_mov_b32 s9, s31
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -14845,12 +14845,12 @@ define void @s_shuffle_v2i64_v8i64__15_5() {
 ; GFX900-NEXT:    s_mov_b32 s12, s30
 ; GFX900-NEXT:    s_mov_b32 s13, s31
 ; GFX900-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -14874,12 +14874,12 @@ define void @s_shuffle_v2i64_v8i64__15_5() {
 ; GFX90A-NEXT:    s_mov_b32 s12, s30
 ; GFX90A-NEXT:    s_mov_b32 s13, s31
 ; GFX90A-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -14999,22 +14999,22 @@ define void @s_shuffle_v2i64_v8i64__15_6() {
 ; GFX942-NEXT:    scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX942-NEXT:    v_writelane_b32 v0, s30, 0
-; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b32 s10, s12
+; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_mov_b32 s8, s30
 ; GFX942-NEXT:    s_mov_b32 s9, s31
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -15120,6 +15120,7 @@ define void @s_shuffle_v2i64_v8i64__15_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
@@ -15127,12 +15128,12 @@ define void @s_shuffle_v2i64_v8i64__15_7() {
 ; GFX942-NEXT:    s_mov_b32 s12, s30
 ; GFX942-NEXT:    s_mov_b32 s13, s31
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -16167,20 +16168,21 @@ define void @s_shuffle_v2i64_v8i64__12_0() {
 ; GFX942-NEXT:    scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX942-NEXT:    v_writelane_b32 v0, s30, 0
-; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[16:31]
+; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:15]
+; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_mov_b32 s10, s16
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b32 s11, s17
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -16890,20 +16892,21 @@ define void @s_shuffle_v2i64_v8i64__12_1() {
 ; GFX942-NEXT:    scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX942-NEXT:    v_writelane_b32 v0, s30, 0
-; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[16:31]
+; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:15]
+; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_mov_b32 s10, s18
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b32 s11, s19
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -17481,6 +17484,7 @@ define void @s_shuffle_v2i64_v8i64__9_2() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[16:31]
 ; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b32 s8, s18
 ; GFX900-NEXT:    s_mov_b32 s9, s19
 ; GFX900-NEXT:    s_mov_b32 s10, s12
@@ -17489,7 +17493,6 @@ define void @s_shuffle_v2i64_v8i64__9_2() {
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -17510,6 +17513,7 @@ define void @s_shuffle_v2i64_v8i64__9_2() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[16:31]
 ; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b32 s8, s18
 ; GFX90A-NEXT:    s_mov_b32 s9, s19
 ; GFX90A-NEXT:    s_mov_b32 s10, s12
@@ -17518,7 +17522,6 @@ define void @s_shuffle_v2i64_v8i64__9_2() {
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -17565,13 +17568,13 @@ define void @s_shuffle_v2i64_v8i64__10_2() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:19]
 ; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b32 s10, s20
 ; GFX900-NEXT:    s_mov_b32 s11, s21
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -17592,13 +17595,13 @@ define void @s_shuffle_v2i64_v8i64__10_2() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:19]
 ; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b32 s10, s20
 ; GFX90A-NEXT:    s_mov_b32 s11, s21
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -17612,6 +17615,7 @@ define void @s_shuffle_v2i64_v8i64__10_2() {
 ; GFX942-NEXT:    scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX942-NEXT:    v_writelane_b32 v0, s30, 0
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
@@ -17620,13 +17624,13 @@ define void @s_shuffle_v2i64_v8i64__10_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b32 s10, s20
 ; GFX942-NEXT:    s_mov_b32 s11, s21
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -17654,6 +17658,7 @@ define void @s_shuffle_v2i64_v8i64__11_2() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[16:31]
 ; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b32 s8, s22
 ; GFX900-NEXT:    s_mov_b32 s9, s23
 ; GFX900-NEXT:    s_mov_b32 s10, s12
@@ -17662,7 +17667,6 @@ define void @s_shuffle_v2i64_v8i64__11_2() {
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -17683,6 +17687,7 @@ define void @s_shuffle_v2i64_v8i64__11_2() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[16:31]
 ; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b32 s8, s22
 ; GFX90A-NEXT:    s_mov_b32 s9, s23
 ; GFX90A-NEXT:    s_mov_b32 s10, s12
@@ -17691,7 +17696,6 @@ define void @s_shuffle_v2i64_v8i64__11_2() {
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -17798,6 +17802,7 @@ define void @s_shuffle_v2i64_v8i64__13_2() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[16:31]
 ; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b32 s8, s26
 ; GFX900-NEXT:    s_mov_b32 s9, s27
 ; GFX900-NEXT:    s_mov_b32 s10, s12
@@ -17806,7 +17811,6 @@ define void @s_shuffle_v2i64_v8i64__13_2() {
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -17827,6 +17831,7 @@ define void @s_shuffle_v2i64_v8i64__13_2() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[16:31]
 ; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b32 s8, s26
 ; GFX90A-NEXT:    s_mov_b32 s9, s27
 ; GFX90A-NEXT:    s_mov_b32 s10, s12
@@ -17835,7 +17840,6 @@ define void @s_shuffle_v2i64_v8i64__13_2() {
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -18315,13 +18319,13 @@ define void @s_shuffle_v2i64_v8i64__10_3() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:19]
 ; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b32 s10, s22
 ; GFX900-NEXT:    s_mov_b32 s11, s23
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -18342,13 +18346,13 @@ define void @s_shuffle_v2i64_v8i64__10_3() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:19]
 ; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b32 s10, s22
 ; GFX90A-NEXT:    s_mov_b32 s11, s23
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -18362,6 +18366,7 @@ define void @s_shuffle_v2i64_v8i64__10_3() {
 ; GFX942-NEXT:    scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX942-NEXT:    v_writelane_b32 v0, s30, 0
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
@@ -18370,13 +18375,13 @@ define void @s_shuffle_v2i64_v8i64__10_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b32 s10, s22
 ; GFX942-NEXT:    s_mov_b32 s11, s23
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -18950,6 +18955,7 @@ define void @s_shuffle_v2i64_v8i64__9_4() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[16:31]
 ; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b32 s8, s18
 ; GFX900-NEXT:    s_mov_b32 s9, s19
 ; GFX900-NEXT:    s_mov_b32 s10, s12
@@ -18958,7 +18964,6 @@ define void @s_shuffle_v2i64_v8i64__9_4() {
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -18979,6 +18984,7 @@ define void @s_shuffle_v2i64_v8i64__9_4() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[16:31]
 ; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b32 s8, s18
 ; GFX90A-NEXT:    s_mov_b32 s9, s19
 ; GFX90A-NEXT:    s_mov_b32 s10, s12
@@ -18987,7 +18993,6 @@ define void @s_shuffle_v2i64_v8i64__9_4() {
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -19004,19 +19009,19 @@ define void @s_shuffle_v2i64_v8i64__9_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b32 s10, s12
+; GFX942-NEXT:    s_mov_b32 s11, s13
 ; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_mov_b32 s8, s18
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b32 s9, s19
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -19100,6 +19105,7 @@ define void @s_shuffle_v2i64_v8i64__11_4() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[16:31]
 ; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b32 s8, s22
 ; GFX900-NEXT:    s_mov_b32 s9, s23
 ; GFX900-NEXT:    s_mov_b32 s10, s12
@@ -19108,7 +19114,6 @@ define void @s_shuffle_v2i64_v8i64__11_4() {
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -19129,6 +19134,7 @@ define void @s_shuffle_v2i64_v8i64__11_4() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[16:31]
 ; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b32 s8, s22
 ; GFX90A-NEXT:    s_mov_b32 s9, s23
 ; GFX90A-NEXT:    s_mov_b32 s10, s12
@@ -19137,7 +19143,6 @@ define void @s_shuffle_v2i64_v8i64__11_4() {
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -19154,19 +19159,19 @@ define void @s_shuffle_v2i64_v8i64__11_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b32 s10, s12
+; GFX942-NEXT:    s_mov_b32 s11, s13
 ; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_mov_b32 s8, s22
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b32 s9, s23
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -19197,12 +19202,12 @@ define void @s_shuffle_v2i64_v8i64__12_4() {
 ; GFX900-NEXT:    s_mov_b32 s26, s12
 ; GFX900-NEXT:    s_mov_b32 s27, s13
 ; GFX900-NEXT:    s_mov_b64 s[8:9], s[24:25]
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b64 s[10:11], s[26:27]
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -19226,12 +19231,12 @@ define void @s_shuffle_v2i64_v8i64__12_4() {
 ; GFX90A-NEXT:    s_mov_b32 s26, s12
 ; GFX90A-NEXT:    s_mov_b32 s27, s13
 ; GFX90A-NEXT:    s_mov_b64 s[8:9], s[24:25]
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b64 s[10:11], s[26:27]
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -19276,6 +19281,7 @@ define void @s_shuffle_v2i64_v8i64__13_4() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[16:31]
 ; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b32 s8, s26
 ; GFX900-NEXT:    s_mov_b32 s9, s27
 ; GFX900-NEXT:    s_mov_b32 s10, s12
@@ -19284,7 +19290,6 @@ define void @s_shuffle_v2i64_v8i64__13_4() {
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -19305,6 +19310,7 @@ define void @s_shuffle_v2i64_v8i64__13_4() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[16:31]
 ; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b32 s8, s26
 ; GFX90A-NEXT:    s_mov_b32 s9, s27
 ; GFX90A-NEXT:    s_mov_b32 s10, s12
@@ -19313,7 +19319,6 @@ define void @s_shuffle_v2i64_v8i64__13_4() {
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -19330,19 +19335,19 @@ define void @s_shuffle_v2i64_v8i64__13_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b32 s10, s12
+; GFX942-NEXT:    s_mov_b32 s11, s13
 ; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_mov_b32 s8, s26
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b32 s9, s27
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -19374,11 +19379,11 @@ define void @s_shuffle_v2i64_v8i64__14_4() {
 ; GFX900-NEXT:    s_mov_b32 s31, s13
 ; GFX900-NEXT:    s_mov_b64 s[8:9], s[28:29]
 ; GFX900-NEXT:    s_mov_b64 s[10:11], s[30:31]
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -19403,11 +19408,11 @@ define void @s_shuffle_v2i64_v8i64__14_4() {
 ; GFX90A-NEXT:    s_mov_b32 s31, s13
 ; GFX90A-NEXT:    s_mov_b64 s[8:9], s[28:29]
 ; GFX90A-NEXT:    s_mov_b64 s[10:11], s[30:31]
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -19874,12 +19879,12 @@ define void @s_shuffle_v2i64_v8i64__9_5() {
 ; GFX900-NEXT:    s_mov_b32 s12, s18
 ; GFX900-NEXT:    s_mov_b32 s13, s19
 ; GFX900-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -19903,12 +19908,12 @@ define void @s_shuffle_v2i64_v8i64__9_5() {
 ; GFX90A-NEXT:    s_mov_b32 s12, s18
 ; GFX90A-NEXT:    s_mov_b32 s13, s19
 ; GFX90A-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -20012,12 +20017,12 @@ define void @s_shuffle_v2i64_v8i64__11_5() {
 ; GFX900-NEXT:    s_mov_b32 s12, s22
 ; GFX900-NEXT:    s_mov_b32 s13, s23
 ; GFX900-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -20041,12 +20046,12 @@ define void @s_shuffle_v2i64_v8i64__11_5() {
 ; GFX90A-NEXT:    s_mov_b32 s12, s22
 ; GFX90A-NEXT:    s_mov_b32 s13, s23
 ; GFX90A-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -20094,12 +20099,12 @@ define void @s_shuffle_v2i64_v8i64__12_5() {
 ; GFX900-NEXT:    s_mov_b32 s26, s14
 ; GFX900-NEXT:    s_mov_b32 s27, s15
 ; GFX900-NEXT:    s_mov_b64 s[8:9], s[24:25]
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b64 s[10:11], s[26:27]
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -20123,12 +20128,12 @@ define void @s_shuffle_v2i64_v8i64__12_5() {
 ; GFX90A-NEXT:    s_mov_b32 s26, s14
 ; GFX90A-NEXT:    s_mov_b32 s27, s15
 ; GFX90A-NEXT:    s_mov_b64 s[8:9], s[24:25]
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b64 s[10:11], s[26:27]
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -20176,12 +20181,12 @@ define void @s_shuffle_v2i64_v8i64__13_5() {
 ; GFX900-NEXT:    s_mov_b32 s12, s26
 ; GFX900-NEXT:    s_mov_b32 s13, s27
 ; GFX900-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -20205,12 +20210,12 @@ define void @s_shuffle_v2i64_v8i64__13_5() {
 ; GFX90A-NEXT:    s_mov_b32 s12, s26
 ; GFX90A-NEXT:    s_mov_b32 s13, s27
 ; GFX90A-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -20259,11 +20264,11 @@ define void @s_shuffle_v2i64_v8i64__14_5() {
 ; GFX900-NEXT:    s_mov_b32 s31, s15
 ; GFX900-NEXT:    s_mov_b64 s[8:9], s[28:29]
 ; GFX900-NEXT:    s_mov_b64 s[10:11], s[30:31]
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -20288,11 +20293,11 @@ define void @s_shuffle_v2i64_v8i64__14_5() {
 ; GFX90A-NEXT:    s_mov_b32 s31, s15
 ; GFX90A-NEXT:    s_mov_b64 s[8:9], s[28:29]
 ; GFX90A-NEXT:    s_mov_b64 s[10:11], s[30:31]
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -20846,22 +20851,22 @@ define void @s_shuffle_v2i64_v8i64__9_6() {
 ; GFX942-NEXT:    scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX942-NEXT:    v_writelane_b32 v0, s30, 0
-; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b32 s10, s12
+; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_mov_b32 s8, s18
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b32 s9, s19
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -21020,22 +21025,22 @@ define void @s_shuffle_v2i64_v8i64__11_6() {
 ; GFX942-NEXT:    scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX942-NEXT:    v_writelane_b32 v0, s30, 0
-; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b32 s10, s12
+; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_mov_b32 s8, s22
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b32 s9, s23
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -21244,22 +21249,22 @@ define void @s_shuffle_v2i64_v8i64__13_6() {
 ; GFX942-NEXT:    scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX942-NEXT:    v_writelane_b32 v0, s30, 0
-; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b32 s10, s12
+; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_mov_b32 s8, s26
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b32 s9, s27
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -21362,10 +21367,11 @@ define void @s_shuffle_v2i64_v8i64__14_6() {
 ; GFX942-NEXT:    scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX942-NEXT:    v_writelane_b32 v0, s30, 0
-; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -21373,11 +21379,11 @@ define void @s_shuffle_v2i64_v8i64__14_6() {
 ; GFX942-NEXT:    s_mov_b32 s31, s13
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[28:29]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[30:31]
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -21909,6 +21915,7 @@ define void @s_shuffle_v2i64_v8i64__9_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
@@ -21916,12 +21923,12 @@ define void @s_shuffle_v2i64_v8i64__9_7() {
 ; GFX942-NEXT:    s_mov_b32 s12, s18
 ; GFX942-NEXT:    s_mov_b32 s13, s19
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -22083,6 +22090,7 @@ define void @s_shuffle_v2i64_v8i64__11_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
@@ -22090,12 +22098,12 @@ define void @s_shuffle_v2i64_v8i64__11_7() {
 ; GFX942-NEXT:    s_mov_b32 s12, s22
 ; GFX942-NEXT:    s_mov_b32 s13, s23
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -22307,6 +22315,7 @@ define void @s_shuffle_v2i64_v8i64__13_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
@@ -22314,12 +22323,12 @@ define void @s_shuffle_v2i64_v8i64__13_7() {
 ; GFX942-NEXT:    s_mov_b32 s12, s26
 ; GFX942-NEXT:    s_mov_b32 s13, s27
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -22422,10 +22431,11 @@ define void @s_shuffle_v2i64_v8i64__14_7() {
 ; GFX942-NEXT:    scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX942-NEXT:    v_writelane_b32 v0, s30, 0
-; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -22433,11 +22443,11 @@ define void @s_shuffle_v2i64_v8i64__14_7() {
 ; GFX942-NEXT:    s_mov_b32 s31, s15
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[28:29]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[30:31]
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -23434,12 +23444,12 @@ define void @s_shuffle_v2i64_v8i64__4_9() {
 ; GFX900-NEXT:    s_mov_b32 s14, s18
 ; GFX900-NEXT:    s_mov_b32 s15, s19
 ; GFX900-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -23463,12 +23473,12 @@ define void @s_shuffle_v2i64_v8i64__4_9() {
 ; GFX90A-NEXT:    s_mov_b32 s14, s18
 ; GFX90A-NEXT:    s_mov_b32 s15, s19
 ; GFX90A-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -23513,13 +23523,13 @@ define void @s_shuffle_v2i64_v8i64__5_9() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:23]
 ; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b32 s8, s26
 ; GFX900-NEXT:    s_mov_b32 s9, s27
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -23540,13 +23550,13 @@ define void @s_shuffle_v2i64_v8i64__5_9() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:23]
 ; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b32 s8, s26
 ; GFX90A-NEXT:    s_mov_b32 s9, s27
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -23560,6 +23570,7 @@ define void @s_shuffle_v2i64_v8i64__5_9() {
 ; GFX942-NEXT:    scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX942-NEXT:    v_writelane_b32 v0, s30, 0
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
@@ -23568,13 +23579,13 @@ define void @s_shuffle_v2i64_v8i64__5_9() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:23]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b32 s8, s26
 ; GFX942-NEXT:    s_mov_b32 s9, s27
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -23680,6 +23691,7 @@ define void @s_shuffle_v2i64_v8i64__6_9() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
@@ -23687,12 +23699,12 @@ define void @s_shuffle_v2i64_v8i64__6_9() {
 ; GFX942-NEXT:    s_mov_b32 s14, s18
 ; GFX942-NEXT:    s_mov_b32 s15, s19
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -24284,12 +24296,12 @@ define void @s_shuffle_v2i64_v8i64__4_10() {
 ; GFX900-NEXT:    s_mov_b32 s14, s20
 ; GFX900-NEXT:    s_mov_b32 s15, s21
 ; GFX900-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -24313,12 +24325,12 @@ define void @s_shuffle_v2i64_v8i64__4_10() {
 ; GFX90A-NEXT:    s_mov_b32 s14, s20
 ; GFX90A-NEXT:    s_mov_b32 s15, s21
 ; GFX90A-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -24363,6 +24375,7 @@ define void @s_shuffle_v2i64_v8i64__5_10() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[16:31]
 ; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b32 s8, s14
 ; GFX900-NEXT:    s_mov_b32 s9, s15
 ; GFX900-NEXT:    s_mov_b32 s10, s20
@@ -24371,7 +24384,6 @@ define void @s_shuffle_v2i64_v8i64__5_10() {
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -24392,6 +24404,7 @@ define void @s_shuffle_v2i64_v8i64__5_10() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[16:31]
 ; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b32 s8, s14
 ; GFX90A-NEXT:    s_mov_b32 s9, s15
 ; GFX90A-NEXT:    s_mov_b32 s10, s20
@@ -24400,7 +24413,6 @@ define void @s_shuffle_v2i64_v8i64__5_10() {
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -24524,6 +24536,7 @@ define void @s_shuffle_v2i64_v8i64__6_10() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
@@ -24531,12 +24544,12 @@ define void @s_shuffle_v2i64_v8i64__6_10() {
 ; GFX942-NEXT:    s_mov_b32 s14, s20
 ; GFX942-NEXT:    s_mov_b32 s15, s21
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -24639,22 +24652,22 @@ define void @s_shuffle_v2i64_v8i64__7_10() {
 ; GFX942-NEXT:    scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX942-NEXT:    v_writelane_b32 v0, s30, 0
-; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b32 s8, s14
+; GFX942-NEXT:    s_mov_b32 s9, s15
+; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
 ; GFX942-NEXT:    s_mov_b32 s10, s20
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b32 s11, s21
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -25235,13 +25248,13 @@ define void @s_shuffle_v2i64_v8i64__3_11() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:19]
 ; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b32 s8, s22
 ; GFX900-NEXT:    s_mov_b32 s9, s23
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -25262,13 +25275,13 @@ define void @s_shuffle_v2i64_v8i64__3_11() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:19]
 ; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b32 s8, s22
 ; GFX90A-NEXT:    s_mov_b32 s9, s23
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -25282,6 +25295,7 @@ define void @s_shuffle_v2i64_v8i64__3_11() {
 ; GFX942-NEXT:    scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX942-NEXT:    v_writelane_b32 v0, s30, 0
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
@@ -25290,13 +25304,13 @@ define void @s_shuffle_v2i64_v8i64__3_11() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b32 s8, s22
 ; GFX942-NEXT:    s_mov_b32 s9, s23
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -25327,12 +25341,12 @@ define void @s_shuffle_v2i64_v8i64__4_11() {
 ; GFX900-NEXT:    s_mov_b32 s14, s22
 ; GFX900-NEXT:    s_mov_b32 s15, s23
 ; GFX900-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -25356,12 +25370,12 @@ define void @s_shuffle_v2i64_v8i64__4_11() {
 ; GFX90A-NEXT:    s_mov_b32 s14, s22
 ; GFX90A-NEXT:    s_mov_b32 s15, s23
 ; GFX90A-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -25540,6 +25554,7 @@ define void @s_shuffle_v2i64_v8i64__6_11() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
@@ -25547,12 +25562,12 @@ define void @s_shuffle_v2i64_v8i64__6_11() {
 ; GFX942-NEXT:    s_mov_b32 s14, s22
 ; GFX942-NEXT:    s_mov_b32 s15, s23
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -26144,12 +26159,12 @@ define void @s_shuffle_v2i64_v8i64__4_12() {
 ; GFX900-NEXT:    s_mov_b32 s14, s24
 ; GFX900-NEXT:    s_mov_b32 s15, s25
 ; GFX900-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -26173,12 +26188,12 @@ define void @s_shuffle_v2i64_v8i64__4_12() {
 ; GFX90A-NEXT:    s_mov_b32 s14, s24
 ; GFX90A-NEXT:    s_mov_b32 s15, s25
 ; GFX90A-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -26223,6 +26238,7 @@ define void @s_shuffle_v2i64_v8i64__5_12() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[16:31]
 ; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b32 s8, s14
 ; GFX900-NEXT:    s_mov_b32 s9, s15
 ; GFX900-NEXT:    s_mov_b32 s10, s24
@@ -26231,7 +26247,6 @@ define void @s_shuffle_v2i64_v8i64__5_12() {
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -26252,6 +26267,7 @@ define void @s_shuffle_v2i64_v8i64__5_12() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[16:31]
 ; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b32 s8, s14
 ; GFX90A-NEXT:    s_mov_b32 s9, s15
 ; GFX90A-NEXT:    s_mov_b32 s10, s24
@@ -26260,7 +26276,6 @@ define void @s_shuffle_v2i64_v8i64__5_12() {
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -26384,6 +26399,7 @@ define void @s_shuffle_v2i64_v8i64__6_12() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
@@ -26391,12 +26407,12 @@ define void @s_shuffle_v2i64_v8i64__6_12() {
 ; GFX942-NEXT:    s_mov_b32 s14, s24
 ; GFX942-NEXT:    s_mov_b32 s15, s25
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -26499,22 +26515,22 @@ define void @s_shuffle_v2i64_v8i64__7_12() {
 ; GFX942-NEXT:    scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX942-NEXT:    v_writelane_b32 v0, s30, 0
-; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b32 s8, s14
+; GFX942-NEXT:    s_mov_b32 s9, s15
+; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
 ; GFX942-NEXT:    s_mov_b32 s10, s24
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b32 s11, s25
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -26880,20 +26896,21 @@ define void @s_shuffle_v2i64_v8i64__1_13() {
 ; GFX942-NEXT:    scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX942-NEXT:    v_writelane_b32 v0, s30, 0
-; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[16:31]
+; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:15]
+; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_mov_b32 s8, s18
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b32 s9, s19
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -27040,12 +27057,12 @@ define void @s_shuffle_v2i64_v8i64__4_13() {
 ; GFX900-NEXT:    s_mov_b32 s14, s26
 ; GFX900-NEXT:    s_mov_b32 s15, s27
 ; GFX900-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -27069,12 +27086,12 @@ define void @s_shuffle_v2i64_v8i64__4_13() {
 ; GFX90A-NEXT:    s_mov_b32 s14, s26
 ; GFX90A-NEXT:    s_mov_b32 s15, s27
 ; GFX90A-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -27122,12 +27139,12 @@ define void @s_shuffle_v2i64_v8i64__5_13() {
 ; GFX900-NEXT:    s_mov_b32 s24, s14
 ; GFX900-NEXT:    s_mov_b32 s25, s15
 ; GFX900-NEXT:    s_mov_b64 s[8:9], s[24:25]
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b64 s[10:11], s[26:27]
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -27151,12 +27168,12 @@ define void @s_shuffle_v2i64_v8i64__5_13() {
 ; GFX90A-NEXT:    s_mov_b32 s24, s14
 ; GFX90A-NEXT:    s_mov_b32 s25, s15
 ; GFX90A-NEXT:    s_mov_b64 s[8:9], s[24:25]
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b64 s[10:11], s[26:27]
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -27279,6 +27296,7 @@ define void @s_shuffle_v2i64_v8i64__6_13() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
@@ -27286,12 +27304,12 @@ define void @s_shuffle_v2i64_v8i64__6_13() {
 ; GFX942-NEXT:    s_mov_b32 s14, s26
 ; GFX942-NEXT:    s_mov_b32 s15, s27
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -27997,12 +28015,12 @@ define void @s_shuffle_v2i64_v8i64__4_14() {
 ; GFX900-NEXT:    s_mov_b32 s14, s28
 ; GFX900-NEXT:    s_mov_b32 s15, s29
 ; GFX900-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -28026,12 +28044,12 @@ define void @s_shuffle_v2i64_v8i64__4_14() {
 ; GFX90A-NEXT:    s_mov_b32 s14, s28
 ; GFX90A-NEXT:    s_mov_b32 s15, s29
 ; GFX90A-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -28076,6 +28094,7 @@ define void @s_shuffle_v2i64_v8i64__5_14() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[16:31]
 ; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b32 s8, s14
 ; GFX900-NEXT:    s_mov_b32 s9, s15
 ; GFX900-NEXT:    s_mov_b32 s10, s28
@@ -28084,7 +28103,6 @@ define void @s_shuffle_v2i64_v8i64__5_14() {
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -28105,6 +28123,7 @@ define void @s_shuffle_v2i64_v8i64__5_14() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[16:31]
 ; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b32 s8, s14
 ; GFX90A-NEXT:    s_mov_b32 s9, s15
 ; GFX90A-NEXT:    s_mov_b32 s10, s28
@@ -28113,7 +28132,6 @@ define void @s_shuffle_v2i64_v8i64__5_14() {
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -28237,6 +28255,7 @@ define void @s_shuffle_v2i64_v8i64__6_14() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
@@ -28244,12 +28263,12 @@ define void @s_shuffle_v2i64_v8i64__6_14() {
 ; GFX942-NEXT:    s_mov_b32 s14, s28
 ; GFX942-NEXT:    s_mov_b32 s15, s29
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -28352,22 +28371,22 @@ define void @s_shuffle_v2i64_v8i64__7_14() {
 ; GFX942-NEXT:    scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX942-NEXT:    v_writelane_b32 v0, s30, 0
-; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b32 s8, s14
+; GFX942-NEXT:    s_mov_b32 s9, s15
+; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
 ; GFX942-NEXT:    s_mov_b32 s10, s28
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b32 s11, s29
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -28978,12 +28997,12 @@ define void @s_shuffle_v2i64_v8i64__4_15() {
 ; GFX900-NEXT:    s_mov_b32 s14, s30
 ; GFX900-NEXT:    s_mov_b32 s15, s31
 ; GFX900-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -29007,12 +29026,12 @@ define void @s_shuffle_v2i64_v8i64__4_15() {
 ; GFX90A-NEXT:    s_mov_b32 s14, s30
 ; GFX90A-NEXT:    s_mov_b32 s15, s31
 ; GFX90A-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -29061,11 +29080,11 @@ define void @s_shuffle_v2i64_v8i64__5_15() {
 ; GFX900-NEXT:    s_mov_b32 s29, s15
 ; GFX900-NEXT:    s_mov_b64 s[8:9], s[28:29]
 ; GFX900-NEXT:    s_mov_b64 s[10:11], s[30:31]
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -29090,11 +29109,11 @@ define void @s_shuffle_v2i64_v8i64__5_15() {
 ; GFX90A-NEXT:    s_mov_b32 s29, s15
 ; GFX90A-NEXT:    s_mov_b64 s[8:9], s[28:29]
 ; GFX90A-NEXT:    s_mov_b64 s[10:11], s[30:31]
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -29219,6 +29238,7 @@ define void @s_shuffle_v2i64_v8i64__6_15() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
@@ -29226,12 +29246,12 @@ define void @s_shuffle_v2i64_v8i64__6_15() {
 ; GFX942-NEXT:    s_mov_b32 s14, s30
 ; GFX942-NEXT:    s_mov_b32 s15, s31
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -29334,22 +29354,23 @@ define void @s_shuffle_v2i64_v8i64__7_15() {
 ; GFX942-NEXT:    scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX942-NEXT:    v_writelane_b32 v0, s30, 0
-; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[16:31]
+; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:15]
+; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_mov_b32 s28, s14
 ; GFX942-NEXT:    s_mov_b32 s29, s15
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[28:29]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[30:31]
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-vgpr-lanes-usage.mir b/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-vgpr-lanes-usage.mir
index 1ffef8e60d90d..ea67593d72761 100644
--- a/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-vgpr-lanes-usage.mir
+++ b/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-vgpr-lanes-usage.mir
@@ -24,10 +24,10 @@ machineFunctionInfo:
 body:             |
   bb.0:
     ; SGPR_SPILLED-LABEL: name: stack-slot-share-equal-sized-spills
-    ; SGPR_SPILLED: liveins: $sgpr30, $sgpr31, $vgpr62
+    ; SGPR_SPILLED: liveins: $vgpr62, $sgpr30_sgpr31
     ; SGPR_SPILLED-NEXT: {{  $}}
-    ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr30, 0, $vgpr62
-    ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr62
+    ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR $sgpr30, 0, $vgpr62, implicit-def $sgpr30_sgpr31, implicit $sgpr30_sgpr31
+    ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr62, implicit killed $sgpr30_sgpr31
     ; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
     ; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr32, 0, [[DEF]]
     ; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr0, 1, [[DEF]], implicit-def $sgpr0_sgpr1, implicit $sgpr0_sgpr1
@@ -89,10 +89,10 @@ machineFunctionInfo:
 body:             |
   bb.0:
     ; SGPR_SPILLED-LABEL: name: stack-slot-share-unequal-sized-spills-with-large-spill-first
-    ; SGPR_SPILLED: liveins: $sgpr30, $sgpr31, $vgpr62
+    ; SGPR_SPILLED: liveins: $vgpr62, $sgpr30_sgpr31
     ; SGPR_SPILLED-NEXT: {{  $}}
-    ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr30, 0, $vgpr62
-    ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr62
+    ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR $sgpr30, 0, $vgpr62, implicit-def $sgpr30_sgpr31, implicit $sgpr30_sgpr31
+    ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr62, implicit killed $sgpr30_sgpr31
     ; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
     ; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr32, 0, [[DEF]]
     ; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr2, 1, [[DEF]], implicit-def $sgpr2_sgpr3, implicit $sgpr2_sgpr3
@@ -152,10 +152,10 @@ machineFunctionInfo:
 body:             |
   bb.0:
     ; SGPR_SPILLED-LABEL: name: stack-slot-share-unequal-sized-spills-with-small-spill-first
-    ; SGPR_SPILLED: liveins: $sgpr30, $sgpr31, $vgpr62
+    ; SGPR_SPILLED: liveins: $vgpr62, $sgpr30_sgpr31
     ; SGPR_SPILLED-NEXT: {{  $}}
-    ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr30, 0, $vgpr62
-    ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr62
+    ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR $sgpr30, 0, $vgpr62, implicit-def $sgpr30_sgpr31, implicit $sgpr30_sgpr31
+    ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr62, implicit killed $sgpr30_sgpr31
     ; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
     ; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr32, 0, [[DEF]]
     ; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr0, 1, [[DEF]]
diff --git a/llvm/test/CodeGen/AMDGPU/sibling-call.ll b/llvm/test/CodeGen/AMDGPU/sibling-call.ll
index 00214ef36e1f0..d4b53e9d338f8 100644
--- a/llvm/test/CodeGen/AMDGPU/sibling-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/sibling-call.ll
@@ -242,8 +242,8 @@ define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32_byval_parent(i32 %a, pt
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], s32
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 2
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -425,8 +425,8 @@ define fastcc i32 @no_sibling_call_callee_more_stack_space(i32 %a, i32 %b) #1 {
 ; GCN-NEXT:    v_writelane_b32 v40, s31, 1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 2
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -469,11 +469,11 @@ define fastcc i32 @sibling_call_i32_fastcc_i32_i32_other_call(i32 %a, i32 %b, i3
 ; GCN-NEXT:    v_mov_b32_e32 v1, v40
 ; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT:    v_readlane_b32 s30, v42, 0
 ; GCN-NEXT:    s_getpc_b64 s[4:5]
 ; GCN-NEXT:    s_add_u32 s4, s4, sibling_call_i32_fastcc_i32_i32 at rel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s5, s5, sibling_call_i32_fastcc_i32_i32 at rel32@hi+12
 ; GCN-NEXT:    v_readlane_b32 s31, v42, 1
-; GCN-NEXT:    v_readlane_b32 s30, v42, 0
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s6, v42, 2
 ; GCN-NEXT:    s_or_saveexec_b64 s[8:9], -1
@@ -603,23 +603,23 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr
 ; FIJI-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; FIJI-NEXT:    s_mov_b64 exec, s[18:19]
 ; FIJI-NEXT:    v_writelane_b32 v40, s16, 18
-; FIJI-NEXT:    v_writelane_b32 v40, s30, 0
-; FIJI-NEXT:    v_writelane_b32 v40, s31, 1
-; FIJI-NEXT:    v_writelane_b32 v40, s34, 2
-; FIJI-NEXT:    v_writelane_b32 v40, s35, 3
-; FIJI-NEXT:    v_writelane_b32 v40, s36, 4
-; FIJI-NEXT:    v_writelane_b32 v40, s37, 5
-; FIJI-NEXT:    v_writelane_b32 v40, s38, 6
-; FIJI-NEXT:    v_writelane_b32 v40, s39, 7
-; FIJI-NEXT:    v_writelane_b32 v40, s48, 8
-; FIJI-NEXT:    v_writelane_b32 v40, s49, 9
-; FIJI-NEXT:    v_writelane_b32 v40, s50, 10
-; FIJI-NEXT:    v_writelane_b32 v40, s51, 11
-; FIJI-NEXT:    v_writelane_b32 v40, s52, 12
-; FIJI-NEXT:    v_writelane_b32 v40, s53, 13
-; FIJI-NEXT:    v_writelane_b32 v40, s54, 14
-; FIJI-NEXT:    v_writelane_b32 v40, s55, 15
-; FIJI-NEXT:    v_writelane_b32 v40, s64, 16
+; FIJI-NEXT:    v_writelane_b32 v40, s34, 0
+; FIJI-NEXT:    v_writelane_b32 v40, s35, 1
+; FIJI-NEXT:    v_writelane_b32 v40, s36, 2
+; FIJI-NEXT:    v_writelane_b32 v40, s37, 3
+; FIJI-NEXT:    v_writelane_b32 v40, s38, 4
+; FIJI-NEXT:    v_writelane_b32 v40, s39, 5
+; FIJI-NEXT:    v_writelane_b32 v40, s48, 6
+; FIJI-NEXT:    v_writelane_b32 v40, s49, 7
+; FIJI-NEXT:    v_writelane_b32 v40, s50, 8
+; FIJI-NEXT:    v_writelane_b32 v40, s51, 9
+; FIJI-NEXT:    v_writelane_b32 v40, s52, 10
+; FIJI-NEXT:    v_writelane_b32 v40, s53, 11
+; FIJI-NEXT:    v_writelane_b32 v40, s54, 12
+; FIJI-NEXT:    v_writelane_b32 v40, s55, 13
+; FIJI-NEXT:    v_writelane_b32 v40, s64, 14
+; FIJI-NEXT:    v_writelane_b32 v40, s65, 15
+; FIJI-NEXT:    v_writelane_b32 v40, s30, 16
 ; FIJI-NEXT:    s_mov_b32 s50, s15
 ; FIJI-NEXT:    s_mov_b32 s51, s14
 ; FIJI-NEXT:    s_mov_b32 s52, s13
@@ -631,7 +631,7 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr
 ; FIJI-NEXT:    v_add_u32_e32 v3, vcc, v3, v4
 ; FIJI-NEXT:    s_mov_b64 s[54:55], exec
 ; FIJI-NEXT:    s_addk_i32 s32, 0x400
-; FIJI-NEXT:    v_writelane_b32 v40, s65, 17
+; FIJI-NEXT:    v_writelane_b32 v40, s31, 17
 ; FIJI-NEXT:  .LBB18_1: ; =>This Inner Loop Header: Depth=1
 ; FIJI-NEXT:    v_readfirstlane_b32 s16, v0
 ; FIJI-NEXT:    v_readfirstlane_b32 s17, v1
@@ -657,25 +657,25 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr
 ; FIJI-NEXT:    s_cbranch_execnz .LBB18_1
 ; FIJI-NEXT:  ; %bb.2:
 ; FIJI-NEXT:    s_mov_b64 exec, s[54:55]
+; FIJI-NEXT:    v_readlane_b32 s30, v40, 16
 ; FIJI-NEXT:    v_mov_b32_e32 v0, v4
-; FIJI-NEXT:    v_readlane_b32 s65, v40, 17
-; FIJI-NEXT:    v_readlane_b32 s64, v40, 16
-; FIJI-NEXT:    v_readlane_b32 s55, v40, 15
-; FIJI-NEXT:    v_readlane_b32 s54, v40, 14
-; FIJI-NEXT:    v_readlane_b32 s53, v40, 13
-; FIJI-NEXT:    v_readlane_b32 s52, v40, 12
-; FIJI-NEXT:    v_readlane_b32 s51, v40, 11
-; FIJI-NEXT:    v_readlane_b32 s50, v40, 10
-; FIJI-NEXT:    v_readlane_b32 s49, v40, 9
-; FIJI-NEXT:    v_readlane_b32 s48, v40, 8
-; FIJI-NEXT:    v_readlane_b32 s39, v40, 7
-; FIJI-NEXT:    v_readlane_b32 s38, v40, 6
-; FIJI-NEXT:    v_readlane_b32 s37, v40, 5
-; FIJI-NEXT:    v_readlane_b32 s36, v40, 4
-; FIJI-NEXT:    v_readlane_b32 s35, v40, 3
-; FIJI-NEXT:    v_readlane_b32 s34, v40, 2
-; FIJI-NEXT:    v_readlane_b32 s31, v40, 1
-; FIJI-NEXT:    v_readlane_b32 s30, v40, 0
+; FIJI-NEXT:    v_readlane_b32 s31, v40, 17
+; FIJI-NEXT:    v_readlane_b32 s65, v40, 15
+; FIJI-NEXT:    v_readlane_b32 s64, v40, 14
+; FIJI-NEXT:    v_readlane_b32 s55, v40, 13
+; FIJI-NEXT:    v_readlane_b32 s54, v40, 12
+; FIJI-NEXT:    v_readlane_b32 s53, v40, 11
+; FIJI-NEXT:    v_readlane_b32 s52, v40, 10
+; FIJI-NEXT:    v_readlane_b32 s51, v40, 9
+; FIJI-NEXT:    v_readlane_b32 s50, v40, 8
+; FIJI-NEXT:    v_readlane_b32 s49, v40, 7
+; FIJI-NEXT:    v_readlane_b32 s48, v40, 6
+; FIJI-NEXT:    v_readlane_b32 s39, v40, 5
+; FIJI-NEXT:    v_readlane_b32 s38, v40, 4
+; FIJI-NEXT:    v_readlane_b32 s37, v40, 3
+; FIJI-NEXT:    v_readlane_b32 s36, v40, 2
+; FIJI-NEXT:    v_readlane_b32 s35, v40, 1
+; FIJI-NEXT:    v_readlane_b32 s34, v40, 0
 ; FIJI-NEXT:    s_mov_b32 s32, s33
 ; FIJI-NEXT:    v_readlane_b32 s4, v40, 18
 ; FIJI-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -694,23 +694,23 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr
 ; HAWAII-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; HAWAII-NEXT:    s_mov_b64 exec, s[18:19]
 ; HAWAII-NEXT:    v_writelane_b32 v40, s16, 18
-; HAWAII-NEXT:    v_writelane_b32 v40, s30, 0
-; HAWAII-NEXT:    v_writelane_b32 v40, s31, 1
-; HAWAII-NEXT:    v_writelane_b32 v40, s34, 2
-; HAWAII-NEXT:    v_writelane_b32 v40, s35, 3
-; HAWAII-NEXT:    v_writelane_b32 v40, s36, 4
-; HAWAII-NEXT:    v_writelane_b32 v40, s37, 5
-; HAWAII-NEXT:    v_writelane_b32 v40, s38, 6
-; HAWAII-NEXT:    v_writelane_b32 v40, s39, 7
-; HAWAII-NEXT:    v_writelane_b32 v40, s48, 8
-; HAWAII-NEXT:    v_writelane_b32 v40, s49, 9
-; HAWAII-NEXT:    v_writelane_b32 v40, s50, 10
-; HAWAII-NEXT:    v_writelane_b32 v40, s51, 11
-; HAWAII-NEXT:    v_writelane_b32 v40, s52, 12
-; HAWAII-NEXT:    v_writelane_b32 v40, s53, 13
-; HAWAII-NEXT:    v_writelane_b32 v40, s54, 14
-; HAWAII-NEXT:    v_writelane_b32 v40, s55, 15
-; HAWAII-NEXT:    v_writelane_b32 v40, s64, 16
+; HAWAII-NEXT:    v_writelane_b32 v40, s34, 0
+; HAWAII-NEXT:    v_writelane_b32 v40, s35, 1
+; HAWAII-NEXT:    v_writelane_b32 v40, s36, 2
+; HAWAII-NEXT:    v_writelane_b32 v40, s37, 3
+; HAWAII-NEXT:    v_writelane_b32 v40, s38, 4
+; HAWAII-NEXT:    v_writelane_b32 v40, s39, 5
+; HAWAII-NEXT:    v_writelane_b32 v40, s48, 6
+; HAWAII-NEXT:    v_writelane_b32 v40, s49, 7
+; HAWAII-NEXT:    v_writelane_b32 v40, s50, 8
+; HAWAII-NEXT:    v_writelane_b32 v40, s51, 9
+; HAWAII-NEXT:    v_writelane_b32 v40, s52, 10
+; HAWAII-NEXT:    v_writelane_b32 v40, s53, 11
+; HAWAII-NEXT:    v_writelane_b32 v40, s54, 12
+; HAWAII-NEXT:    v_writelane_b32 v40, s55, 13
+; HAWAII-NEXT:    v_writelane_b32 v40, s64, 14
+; HAWAII-NEXT:    v_writelane_b32 v40, s65, 15
+; HAWAII-NEXT:    v_writelane_b32 v40, s30, 16
 ; HAWAII-NEXT:    s_mov_b32 s50, s15
 ; HAWAII-NEXT:    s_mov_b32 s51, s14
 ; HAWAII-NEXT:    s_mov_b32 s52, s13
@@ -722,7 +722,7 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr
 ; HAWAII-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
 ; HAWAII-NEXT:    s_mov_b64 s[54:55], exec
 ; HAWAII-NEXT:    s_addk_i32 s32, 0x400
-; HAWAII-NEXT:    v_writelane_b32 v40, s65, 17
+; HAWAII-NEXT:    v_writelane_b32 v40, s31, 17
 ; HAWAII-NEXT:  .LBB18_1: ; =>This Inner Loop Header: Depth=1
 ; HAWAII-NEXT:    v_readfirstlane_b32 s16, v0
 ; HAWAII-NEXT:    v_readfirstlane_b32 s17, v1
@@ -748,25 +748,25 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr
 ; HAWAII-NEXT:    s_cbranch_execnz .LBB18_1
 ; HAWAII-NEXT:  ; %bb.2:
 ; HAWAII-NEXT:    s_mov_b64 exec, s[54:55]
+; HAWAII-NEXT:    v_readlane_b32 s30, v40, 16
 ; HAWAII-NEXT:    v_mov_b32_e32 v0, v4
-; HAWAII-NEXT:    v_readlane_b32 s65, v40, 17
-; HAWAII-NEXT:    v_readlane_b32 s64, v40, 16
-; HAWAII-NEXT:    v_readlane_b32 s55, v40, 15
-; HAWAII-NEXT:    v_readlane_b32 s54, v40, 14
-; HAWAII-NEXT:    v_readlane_b32 s53, v40, 13
-; HAWAII-NEXT:    v_readlane_b32 s52, v40, 12
-; HAWAII-NEXT:    v_readlane_b32 s51, v40, 11
-; HAWAII-NEXT:    v_readlane_b32 s50, v40, 10
-; HAWAII-NEXT:    v_readlane_b32 s49, v40, 9
-; HAWAII-NEXT:    v_readlane_b32 s48, v40, 8
-; HAWAII-NEXT:    v_readlane_b32 s39, v40, 7
-; HAWAII-NEXT:    v_readlane_b32 s38, v40, 6
-; HAWAII-NEXT:    v_readlane_b32 s37, v40, 5
-; HAWAII-NEXT:    v_readlane_b32 s36, v40, 4
-; HAWAII-NEXT:    v_readlane_b32 s35, v40, 3
-; HAWAII-NEXT:    v_readlane_b32 s34, v40, 2
-; HAWAII-NEXT:    v_readlane_b32 s31, v40, 1
-; HAWAII-NEXT:    v_readlane_b32 s30, v40, 0
+; HAWAII-NEXT:    v_readlane_b32 s31, v40, 17
+; HAWAII-NEXT:    v_readlane_b32 s65, v40, 15
+; HAWAII-NEXT:    v_readlane_b32 s64, v40, 14
+; HAWAII-NEXT:    v_readlane_b32 s55, v40, 13
+; HAWAII-NEXT:    v_readlane_b32 s54, v40, 12
+; HAWAII-NEXT:    v_readlane_b32 s53, v40, 11
+; HAWAII-NEXT:    v_readlane_b32 s52, v40, 10
+; HAWAII-NEXT:    v_readlane_b32 s51, v40, 9
+; HAWAII-NEXT:    v_readlane_b32 s50, v40, 8
+; HAWAII-NEXT:    v_readlane_b32 s49, v40, 7
+; HAWAII-NEXT:    v_readlane_b32 s48, v40, 6
+; HAWAII-NEXT:    v_readlane_b32 s39, v40, 5
+; HAWAII-NEXT:    v_readlane_b32 s38, v40, 4
+; HAWAII-NEXT:    v_readlane_b32 s37, v40, 3
+; HAWAII-NEXT:    v_readlane_b32 s36, v40, 2
+; HAWAII-NEXT:    v_readlane_b32 s35, v40, 1
+; HAWAII-NEXT:    v_readlane_b32 s34, v40, 0
 ; HAWAII-NEXT:    s_mov_b32 s32, s33
 ; HAWAII-NEXT:    v_readlane_b32 s4, v40, 18
 ; HAWAII-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -785,23 +785,23 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[18:19]
 ; GFX9-NEXT:    v_writelane_b32 v40, s16, 18
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s35, 3
-; GFX9-NEXT:    v_writelane_b32 v40, s36, 4
-; GFX9-NEXT:    v_writelane_b32 v40, s37, 5
-; GFX9-NEXT:    v_writelane_b32 v40, s38, 6
-; GFX9-NEXT:    v_writelane_b32 v40, s39, 7
-; GFX9-NEXT:    v_writelane_b32 v40, s48, 8
-; GFX9-NEXT:    v_writelane_b32 v40, s49, 9
-; GFX9-NEXT:    v_writelane_b32 v40, s50, 10
-; GFX9-NEXT:    v_writelane_b32 v40, s51, 11
-; GFX9-NEXT:    v_writelane_b32 v40, s52, 12
-; GFX9-NEXT:    v_writelane_b32 v40, s53, 13
-; GFX9-NEXT:    v_writelane_b32 v40, s54, 14
-; GFX9-NEXT:    v_writelane_b32 v40, s55, 15
-; GFX9-NEXT:    v_writelane_b32 v40, s64, 16
+; GFX9-NEXT:    v_writelane_b32 v40, s34, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s35, 1
+; GFX9-NEXT:    v_writelane_b32 v40, s36, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s37, 3
+; GFX9-NEXT:    v_writelane_b32 v40, s38, 4
+; GFX9-NEXT:    v_writelane_b32 v40, s39, 5
+; GFX9-NEXT:    v_writelane_b32 v40, s48, 6
+; GFX9-NEXT:    v_writelane_b32 v40, s49, 7
+; GFX9-NEXT:    v_writelane_b32 v40, s50, 8
+; GFX9-NEXT:    v_writelane_b32 v40, s51, 9
+; GFX9-NEXT:    v_writelane_b32 v40, s52, 10
+; GFX9-NEXT:    v_writelane_b32 v40, s53, 11
+; GFX9-NEXT:    v_writelane_b32 v40, s54, 12
+; GFX9-NEXT:    v_writelane_b32 v40, s55, 13
+; GFX9-NEXT:    v_writelane_b32 v40, s64, 14
+; GFX9-NEXT:    v_writelane_b32 v40, s65, 15
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 16
 ; GFX9-NEXT:    s_mov_b32 s50, s15
 ; GFX9-NEXT:    s_mov_b32 s51, s14
 ; GFX9-NEXT:    s_mov_b32 s52, s13
@@ -813,7 +813,7 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr
 ; GFX9-NEXT:    v_add_u32_e32 v3, v3, v4
 ; GFX9-NEXT:    s_mov_b64 s[54:55], exec
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s65, 17
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 17
 ; GFX9-NEXT:  .LBB18_1: ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    v_readfirstlane_b32 s16, v0
 ; GFX9-NEXT:    v_readfirstlane_b32 s17, v1
@@ -839,25 +839,25 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr
 ; GFX9-NEXT:    s_cbranch_execnz .LBB18_1
 ; GFX9-NEXT:  ; %bb.2:
 ; GFX9-NEXT:    s_mov_b64 exec, s[54:55]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 16
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v4
-; GFX9-NEXT:    v_readlane_b32 s65, v40, 17
-; GFX9-NEXT:    v_readlane_b32 s64, v40, 16
-; GFX9-NEXT:    v_readlane_b32 s55, v40, 15
-; GFX9-NEXT:    v_readlane_b32 s54, v40, 14
-; GFX9-NEXT:    v_readlane_b32 s53, v40, 13
-; GFX9-NEXT:    v_readlane_b32 s52, v40, 12
-; GFX9-NEXT:    v_readlane_b32 s51, v40, 11
-; GFX9-NEXT:    v_readlane_b32 s50, v40, 10
-; GFX9-NEXT:    v_readlane_b32 s49, v40, 9
-; GFX9-NEXT:    v_readlane_b32 s48, v40, 8
-; GFX9-NEXT:    v_readlane_b32 s39, v40, 7
-; GFX9-NEXT:    v_readlane_b32 s38, v40, 6
-; GFX9-NEXT:    v_readlane_b32 s37, v40, 5
-; GFX9-NEXT:    v_readlane_b32 s36, v40, 4
-; GFX9-NEXT:    v_readlane_b32 s35, v40, 3
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 17
+; GFX9-NEXT:    v_readlane_b32 s65, v40, 15
+; GFX9-NEXT:    v_readlane_b32 s64, v40, 14
+; GFX9-NEXT:    v_readlane_b32 s55, v40, 13
+; GFX9-NEXT:    v_readlane_b32 s54, v40, 12
+; GFX9-NEXT:    v_readlane_b32 s53, v40, 11
+; GFX9-NEXT:    v_readlane_b32 s52, v40, 10
+; GFX9-NEXT:    v_readlane_b32 s51, v40, 9
+; GFX9-NEXT:    v_readlane_b32 s50, v40, 8
+; GFX9-NEXT:    v_readlane_b32 s49, v40, 7
+; GFX9-NEXT:    v_readlane_b32 s48, v40, 6
+; GFX9-NEXT:    v_readlane_b32 s39, v40, 5
+; GFX9-NEXT:    v_readlane_b32 s38, v40, 4
+; GFX9-NEXT:    v_readlane_b32 s37, v40, 3
+; GFX9-NEXT:    v_readlane_b32 s36, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s35, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s34, v40, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 18
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign.ll b/llvm/test/CodeGen/AMDGPU/stack-realign.ll
index 52d9a30ffb8ba..1c9ee01807a5f 100644
--- a/llvm/test/CodeGen/AMDGPU/stack-realign.ll
+++ b/llvm/test/CodeGen/AMDGPU/stack-realign.ll
@@ -311,8 +311,8 @@ define void @func_call_align1024_bp_gets_vgpr_spill(<32 x i32> %a, i32 %b) #0 {
 ; GCN-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:4
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    s_mov_b32 s32, s34
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 2
 ; GCN-NEXT:    v_readlane_b32 s34, v40, 3
diff --git a/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll b/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll
index d2394bab82c77..06f1c08d1f04e 100644
--- a/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll
+++ b/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll
@@ -1283,11 +1283,11 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() {
 ; WAVE32-OPT-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
 ; WAVE32-OPT-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; WAVE32-OPT-NEXT:    s_mov_b32 s32, s18
+; WAVE32-OPT-NEXT:    v_readlane_b32 s30, v32, 0
 ; WAVE32-OPT-NEXT:    ;;#ASMSTART
 ; WAVE32-OPT-NEXT:    ; use s19
 ; WAVE32-OPT-NEXT:    ;;#ASMEND
 ; WAVE32-OPT-NEXT:    v_readlane_b32 s31, v32, 1
-; WAVE32-OPT-NEXT:    v_readlane_b32 s30, v32, 0
 ; WAVE32-OPT-NEXT:    s_mov_b32 s32, s33
 ; WAVE32-OPT-NEXT:    s_xor_saveexec_b32 s4, -1
 ; WAVE32-OPT-NEXT:    buffer_load_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload
@@ -1318,11 +1318,11 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() {
 ; WAVE64-OPT-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
 ; WAVE64-OPT-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; WAVE64-OPT-NEXT:    s_mov_b32 s32, s18
+; WAVE64-OPT-NEXT:    v_readlane_b32 s30, v32, 0
 ; WAVE64-OPT-NEXT:    ;;#ASMSTART
 ; WAVE64-OPT-NEXT:    ; use s19
 ; WAVE64-OPT-NEXT:    ;;#ASMEND
 ; WAVE64-OPT-NEXT:    v_readlane_b32 s31, v32, 1
-; WAVE64-OPT-NEXT:    v_readlane_b32 s30, v32, 0
 ; WAVE64-OPT-NEXT:    s_mov_b32 s32, s33
 ; WAVE64-OPT-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; WAVE64-OPT-NEXT:    buffer_load_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload
@@ -1431,8 +1431,8 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() {
 ; WAVE32-O0-NEXT:    ; use s5
 ; WAVE32-O0-NEXT:    ;;#ASMEND
 ; WAVE32-O0-NEXT:    s_mov_b32 s32, s4
-; WAVE32-O0-NEXT:    v_readlane_b32 s31, v32, 1
 ; WAVE32-O0-NEXT:    v_readlane_b32 s30, v32, 0
+; WAVE32-O0-NEXT:    v_readlane_b32 s31, v32, 1
 ; WAVE32-O0-NEXT:    s_mov_b32 s32, s33
 ; WAVE32-O0-NEXT:    s_xor_saveexec_b32 s4, -1
 ; WAVE32-O0-NEXT:    buffer_load_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload
@@ -1542,8 +1542,8 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() {
 ; WAVE64-O0-NEXT:    ; use s5
 ; WAVE64-O0-NEXT:    ;;#ASMEND
 ; WAVE64-O0-NEXT:    s_mov_b32 s32, s4
-; WAVE64-O0-NEXT:    v_readlane_b32 s31, v32, 1
 ; WAVE64-O0-NEXT:    v_readlane_b32 s30, v32, 0
+; WAVE64-O0-NEXT:    v_readlane_b32 s31, v32, 1
 ; WAVE64-O0-NEXT:    s_mov_b32 s32, s33
 ; WAVE64-O0-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; WAVE64-O0-NEXT:    buffer_load_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload
@@ -1653,8 +1653,8 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() {
 ; WAVE32-WWM-PREALLOC-NEXT:    ; use s5
 ; WAVE32-WWM-PREALLOC-NEXT:    ;;#ASMEND
 ; WAVE32-WWM-PREALLOC-NEXT:    s_mov_b32 s32, s4
-; WAVE32-WWM-PREALLOC-NEXT:    v_readlane_b32 s31, v33, 1
 ; WAVE32-WWM-PREALLOC-NEXT:    v_readlane_b32 s30, v33, 0
+; WAVE32-WWM-PREALLOC-NEXT:    v_readlane_b32 s31, v33, 1
 ; WAVE32-WWM-PREALLOC-NEXT:    s_mov_b32 s32, s33
 ; WAVE32-WWM-PREALLOC-NEXT:    s_xor_saveexec_b32 s4, -1
 ; WAVE32-WWM-PREALLOC-NEXT:    buffer_load_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll b/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll
index 5c83491e7289e..4232a09d9fc3a 100644
--- a/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll
+++ b/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll
@@ -194,8 +194,8 @@ define void @outgoing_f16_arg(ptr %ptr) #0 {
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX7-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX7-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX7-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX7-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX7-NEXT:    s_mov_b32 s32, s33
 ; GFX7-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX7-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -230,8 +230,8 @@ define void @outgoing_v2f16_arg(ptr %ptr) #0 {
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX7-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX7-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX7-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX7-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX7-NEXT:    s_mov_b32 s32, s33
 ; GFX7-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX7-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -266,8 +266,8 @@ define void @outgoing_f16_return(ptr %ptr) #0 {
 ; GFX7-NEXT:    v_mov_b32_e32 v40, v0
 ; GFX7-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX7-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX7-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX7-NEXT:    s_mov_b32 s32, s33
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
@@ -309,8 +309,8 @@ define void @outgoing_v2f16_return(ptr %ptr) #0 {
 ; GFX7-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX7-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX7-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
@@ -381,8 +381,8 @@ define void @outgoing_v4f16_return(ptr %ptr) #0 {
 ; GFX7-NEXT:    flat_store_dword v[40:41], v4
 ; GFX7-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX7-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX7-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX7-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX7-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX7-NEXT:    s_mov_b32 s32, s33
 ; GFX7-NEXT:    v_readlane_b32 s4, v42, 2
 ; GFX7-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -468,8 +468,8 @@ define void @outgoing_v8f16_return(ptr %ptr) #0 {
 ; GFX7-NEXT:    flat_store_dword v[40:41], v8
 ; GFX7-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX7-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX7-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX7-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX7-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX7-NEXT:    s_mov_b32 s32, s33
 ; GFX7-NEXT:    v_readlane_b32 s4, v42, 2
 ; GFX7-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -518,6 +518,7 @@ define half @call_split_type_used_outside_block_v8f16() #0 {
 ; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v7
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v6
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
@@ -527,7 +528,6 @@ define half @call_split_type_used_outside_block_v8f16() #0 {
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX7-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX7-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX7-NEXT:    s_mov_b32 s32, s33
 ; GFX7-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX7-NEXT:    s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/swdev504645-global-fold.ll b/llvm/test/CodeGen/AMDGPU/swdev504645-global-fold.ll
index 5c6fcd4f977e3..13cde61ff16a0 100644
--- a/llvm/test/CodeGen/AMDGPU/swdev504645-global-fold.ll
+++ b/llvm/test/CodeGen/AMDGPU/swdev504645-global-fold.ll
@@ -18,11 +18,12 @@ define void @test_load_zext() #0 {
 ; CHECK-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; CHECK-NEXT:    v_writelane_b32 v40, s30, 0
 ; CHECK-NEXT:    s_mov_b32 s0, DescriptorBuffer at abs32@lo
+; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    v_writelane_b32 v40, s31, 1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[2:3]
-; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
 ; CHECK-NEXT:    v_readlane_b32 s30, v40, 0
+; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
 ; CHECK-NEXT:    s_mov_b32 s32, s33
 ; CHECK-NEXT:    v_readlane_b32 s0, v40, 2
 ; CHECK-NEXT:    s_or_saveexec_b64 s[2:3], -1
diff --git a/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.error.ll b/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.error.ll
index d76b2e5f2358d..837b66dadbba5 100644
--- a/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.error.ll
+++ b/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.error.ll
@@ -26,8 +26,8 @@ define void @tail_call_i32_inreg_divergent(i32 %vgpr) #0 {
 ; CHECK-NEXT:    s_addc_u32 s17, s17, void_func_i32_inreg at rel32@hi+12
 ; CHECK-NEXT:     ; illegal copy v0 to s0
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
 ; CHECK-NEXT:    v_readlane_b32 s30, v40, 0
+; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
 ; CHECK-NEXT:    s_mov_b32 s32, s33
 ; CHECK-NEXT:    v_readlane_b32 s4, v40, 2
 ; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -62,8 +62,8 @@ define void @indirect_tail_call_i32_inreg_divergent(i32 %vgpr) #0 {
 ; CHECK-NEXT:     ; illegal copy v0 to s0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
 ; CHECK-NEXT:    v_readlane_b32 s30, v40, 0
+; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
 ; CHECK-NEXT:    s_mov_b32 s32, s33
 ; CHECK-NEXT:    v_readlane_b32 s4, v40, 2
 ; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll b/llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll
index e78d62561238b..e5215fe1acdef 100644
--- a/llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll
+++ b/llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll
@@ -646,29 +646,30 @@ define i32 @s_in_multiuse_A(i32 inreg %x, i32 inreg %y, i32 inreg %z, i32 inreg
 ; GCN-NEXT:    s_or_saveexec_b32 s16, -1
 ; GCN-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b32 exec_lo, s16
-; GCN-NEXT:    v_writelane_b32 v40, s2, 4
 ; GCN-NEXT:    s_add_i32 s32, s32, 16
 ; GCN-NEXT:    s_getpc_b64 s[16:17]
 ; GCN-NEXT:    s_add_u32 s16, s16, use32 at gotpcrel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s17, s17, use32 at gotpcrel32@hi+12
-; GCN-NEXT:    s_xor_b32 s0, s0, s1
+; GCN-NEXT:    v_writelane_b32 v40, s2, 4
 ; GCN-NEXT:    s_load_b64 s[16:17], s[16:17], 0x0
-; GCN-NEXT:    v_writelane_b32 v40, s30, 0
-; GCN-NEXT:    v_writelane_b32 v40, s31, 1
-; GCN-NEXT:    v_writelane_b32 v40, s34, 2
+; GCN-NEXT:    s_xor_b32 s0, s0, s1
+; GCN-NEXT:    v_writelane_b32 v40, s34, 0
 ; GCN-NEXT:    s_mov_b32 s34, s1
-; GCN-NEXT:    v_writelane_b32 v40, s35, 3
+; GCN-NEXT:    v_writelane_b32 v40, s35, 1
 ; GCN-NEXT:    s_and_b32 s35, s0, s3
 ; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s35
+; GCN-NEXT:    v_writelane_b32 v40, s30, 2
+; GCN-NEXT:    v_writelane_b32 v40, s31, 3
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GCN-NEXT:    s_xor_b32 s0, s35, s34
-; GCN-NEXT:    v_readlane_b32 s35, v40, 3
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT:    v_readlane_b32 s30, v40, 2
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-NEXT:    v_readlane_b32 s34, v40, 2
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
-; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s31, v40, 3
+; GCN-NEXT:    v_readlane_b32 s35, v40, 1
+; GCN-NEXT:    v_readlane_b32 s34, v40, 0
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s0, v40, 4
 ; GCN-NEXT:    s_or_saveexec_b32 s1, -1
@@ -702,20 +703,21 @@ define i32 @s_in_multiuse_B(i32 inreg %x, i32 inreg %y, i32 inreg %z, i32 inreg
 ; GCN-NEXT:    s_xor_b32 s0, s0, s1
 ; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-NEXT:    v_writelane_b32 v40, s30, 0
-; GCN-NEXT:    v_writelane_b32 v40, s31, 1
-; GCN-NEXT:    v_writelane_b32 v40, s34, 2
+; GCN-NEXT:    v_writelane_b32 v40, s34, 0
 ; GCN-NEXT:    s_mov_b32 s34, s1
-; GCN-NEXT:    v_writelane_b32 v40, s35, 3
+; GCN-NEXT:    v_writelane_b32 v40, s35, 1
 ; GCN-NEXT:    s_and_b32 s35, s0, s3
+; GCN-NEXT:    v_writelane_b32 v40, s30, 2
+; GCN-NEXT:    v_writelane_b32 v40, s31, 3
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GCN-NEXT:    s_xor_b32 s0, s35, s34
-; GCN-NEXT:    v_readlane_b32 s35, v40, 3
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT:    v_readlane_b32 s30, v40, 2
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-NEXT:    v_readlane_b32 s34, v40, 2
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
-; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s31, v40, 3
+; GCN-NEXT:    v_readlane_b32 s35, v40, 1
+; GCN-NEXT:    v_readlane_b32 s34, v40, 0
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s0, v40, 4
 ; GCN-NEXT:    s_or_saveexec_b32 s1, -1
diff --git a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
index 25e8581fb6cdd..639dcdcbf1c2a 100644
--- a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
+++ b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
@@ -14,22 +14,22 @@ define hidden void @widget() {
 ; GCN-NEXT:    v_writelane_b32 v41, s16, 16
 ; GCN-NEXT:    s_addk_i32 s32, 0x400
 ; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-NEXT:    v_writelane_b32 v41, s30, 0
-; GCN-NEXT:    v_writelane_b32 v41, s31, 1
-; GCN-NEXT:    v_writelane_b32 v41, s34, 2
-; GCN-NEXT:    v_writelane_b32 v41, s35, 3
-; GCN-NEXT:    v_writelane_b32 v41, s36, 4
-; GCN-NEXT:    v_writelane_b32 v41, s37, 5
-; GCN-NEXT:    v_writelane_b32 v41, s38, 6
-; GCN-NEXT:    v_writelane_b32 v41, s39, 7
-; GCN-NEXT:    v_writelane_b32 v41, s48, 8
-; GCN-NEXT:    v_writelane_b32 v41, s49, 9
-; GCN-NEXT:    v_writelane_b32 v41, s50, 10
-; GCN-NEXT:    v_writelane_b32 v41, s51, 11
-; GCN-NEXT:    v_writelane_b32 v41, s52, 12
-; GCN-NEXT:    v_writelane_b32 v41, s53, 13
-; GCN-NEXT:    v_writelane_b32 v41, s54, 14
-; GCN-NEXT:    v_writelane_b32 v41, s55, 15
+; GCN-NEXT:    v_writelane_b32 v41, s34, 0
+; GCN-NEXT:    v_writelane_b32 v41, s35, 1
+; GCN-NEXT:    v_writelane_b32 v41, s36, 2
+; GCN-NEXT:    v_writelane_b32 v41, s37, 3
+; GCN-NEXT:    v_writelane_b32 v41, s38, 4
+; GCN-NEXT:    v_writelane_b32 v41, s39, 5
+; GCN-NEXT:    v_writelane_b32 v41, s48, 6
+; GCN-NEXT:    v_writelane_b32 v41, s49, 7
+; GCN-NEXT:    v_writelane_b32 v41, s50, 8
+; GCN-NEXT:    v_writelane_b32 v41, s51, 9
+; GCN-NEXT:    v_writelane_b32 v41, s52, 10
+; GCN-NEXT:    v_writelane_b32 v41, s53, 11
+; GCN-NEXT:    v_writelane_b32 v41, s54, 12
+; GCN-NEXT:    v_writelane_b32 v41, s55, 13
+; GCN-NEXT:    v_writelane_b32 v41, s30, 14
+; GCN-NEXT:    v_writelane_b32 v41, s31, 15
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    flat_load_dword v0, v[0:1]
@@ -93,22 +93,22 @@ define hidden void @widget() {
 ; GCN-NEXT:    s_addc_u32 s17, s17, wibble at rel32@hi+12
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GCN-NEXT:  .LBB0_8: ; %UnifiedReturnBlock
-; GCN-NEXT:    v_readlane_b32 s55, v41, 15
-; GCN-NEXT:    v_readlane_b32 s54, v41, 14
-; GCN-NEXT:    v_readlane_b32 s53, v41, 13
-; GCN-NEXT:    v_readlane_b32 s52, v41, 12
-; GCN-NEXT:    v_readlane_b32 s51, v41, 11
-; GCN-NEXT:    v_readlane_b32 s50, v41, 10
-; GCN-NEXT:    v_readlane_b32 s49, v41, 9
-; GCN-NEXT:    v_readlane_b32 s48, v41, 8
-; GCN-NEXT:    v_readlane_b32 s39, v41, 7
-; GCN-NEXT:    v_readlane_b32 s38, v41, 6
-; GCN-NEXT:    v_readlane_b32 s37, v41, 5
-; GCN-NEXT:    v_readlane_b32 s36, v41, 4
-; GCN-NEXT:    v_readlane_b32 s35, v41, 3
-; GCN-NEXT:    v_readlane_b32 s34, v41, 2
-; GCN-NEXT:    v_readlane_b32 s31, v41, 1
-; GCN-NEXT:    v_readlane_b32 s30, v41, 0
+; GCN-NEXT:    v_readlane_b32 s30, v41, 14
+; GCN-NEXT:    v_readlane_b32 s31, v41, 15
+; GCN-NEXT:    v_readlane_b32 s55, v41, 13
+; GCN-NEXT:    v_readlane_b32 s54, v41, 12
+; GCN-NEXT:    v_readlane_b32 s53, v41, 11
+; GCN-NEXT:    v_readlane_b32 s52, v41, 10
+; GCN-NEXT:    v_readlane_b32 s51, v41, 9
+; GCN-NEXT:    v_readlane_b32 s50, v41, 8
+; GCN-NEXT:    v_readlane_b32 s49, v41, 7
+; GCN-NEXT:    v_readlane_b32 s48, v41, 6
+; GCN-NEXT:    v_readlane_b32 s39, v41, 5
+; GCN-NEXT:    v_readlane_b32 s38, v41, 4
+; GCN-NEXT:    v_readlane_b32 s37, v41, 3
+; GCN-NEXT:    v_readlane_b32 s36, v41, 2
+; GCN-NEXT:    v_readlane_b32 s35, v41, 1
+; GCN-NEXT:    v_readlane_b32 s34, v41, 0
 ; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v41, 16
@@ -266,32 +266,32 @@ define hidden void @blam() {
 ; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-NEXT:    v_writelane_b32 v45, s30, 0
-; GCN-NEXT:    v_writelane_b32 v45, s31, 1
-; GCN-NEXT:    v_writelane_b32 v45, s34, 2
-; GCN-NEXT:    v_writelane_b32 v45, s35, 3
-; GCN-NEXT:    v_writelane_b32 v45, s36, 4
-; GCN-NEXT:    v_writelane_b32 v45, s37, 5
-; GCN-NEXT:    v_writelane_b32 v45, s38, 6
-; GCN-NEXT:    v_writelane_b32 v45, s39, 7
-; GCN-NEXT:    v_writelane_b32 v45, s48, 8
-; GCN-NEXT:    v_writelane_b32 v45, s49, 9
-; GCN-NEXT:    v_writelane_b32 v45, s50, 10
-; GCN-NEXT:    v_writelane_b32 v45, s51, 11
-; GCN-NEXT:    v_writelane_b32 v45, s52, 12
-; GCN-NEXT:    v_writelane_b32 v45, s53, 13
-; GCN-NEXT:    v_writelane_b32 v45, s54, 14
-; GCN-NEXT:    v_writelane_b32 v45, s55, 15
-; GCN-NEXT:    v_writelane_b32 v45, s64, 16
-; GCN-NEXT:    v_writelane_b32 v45, s65, 17
-; GCN-NEXT:    v_writelane_b32 v45, s66, 18
-; GCN-NEXT:    v_writelane_b32 v45, s67, 19
-; GCN-NEXT:    v_writelane_b32 v45, s68, 20
-; GCN-NEXT:    v_writelane_b32 v45, s69, 21
-; GCN-NEXT:    v_writelane_b32 v45, s70, 22
-; GCN-NEXT:    v_writelane_b32 v45, s71, 23
-; GCN-NEXT:    v_writelane_b32 v45, s80, 24
-; GCN-NEXT:    v_writelane_b32 v45, s81, 25
+; GCN-NEXT:    v_writelane_b32 v45, s34, 0
+; GCN-NEXT:    v_writelane_b32 v45, s35, 1
+; GCN-NEXT:    v_writelane_b32 v45, s36, 2
+; GCN-NEXT:    v_writelane_b32 v45, s37, 3
+; GCN-NEXT:    v_writelane_b32 v45, s38, 4
+; GCN-NEXT:    v_writelane_b32 v45, s39, 5
+; GCN-NEXT:    v_writelane_b32 v45, s48, 6
+; GCN-NEXT:    v_writelane_b32 v45, s49, 7
+; GCN-NEXT:    v_writelane_b32 v45, s50, 8
+; GCN-NEXT:    v_writelane_b32 v45, s51, 9
+; GCN-NEXT:    v_writelane_b32 v45, s52, 10
+; GCN-NEXT:    v_writelane_b32 v45, s53, 11
+; GCN-NEXT:    v_writelane_b32 v45, s54, 12
+; GCN-NEXT:    v_writelane_b32 v45, s55, 13
+; GCN-NEXT:    v_writelane_b32 v45, s64, 14
+; GCN-NEXT:    v_writelane_b32 v45, s65, 15
+; GCN-NEXT:    v_writelane_b32 v45, s66, 16
+; GCN-NEXT:    v_writelane_b32 v45, s67, 17
+; GCN-NEXT:    v_writelane_b32 v45, s68, 18
+; GCN-NEXT:    v_writelane_b32 v45, s69, 19
+; GCN-NEXT:    v_writelane_b32 v45, s70, 20
+; GCN-NEXT:    v_writelane_b32 v45, s71, 21
+; GCN-NEXT:    v_writelane_b32 v45, s80, 22
+; GCN-NEXT:    v_writelane_b32 v45, s81, 23
+; GCN-NEXT:    v_writelane_b32 v45, s30, 24
+; GCN-NEXT:    v_writelane_b32 v45, s31, 25
 ; GCN-NEXT:    v_mov_b32_e32 v40, v31
 ; GCN-NEXT:    s_mov_b32 s54, s15
 ; GCN-NEXT:    s_mov_b32 s55, s14
@@ -427,32 +427,32 @@ define hidden void @blam() {
 ; GCN-NEXT:    s_branch .LBB1_1
 ; GCN-NEXT:  .LBB1_18: ; %DummyReturnBlock
 ; GCN-NEXT:    s_or_b64 exec, exec, s[66:67]
-; GCN-NEXT:    v_readlane_b32 s81, v45, 25
-; GCN-NEXT:    v_readlane_b32 s80, v45, 24
-; GCN-NEXT:    v_readlane_b32 s71, v45, 23
-; GCN-NEXT:    v_readlane_b32 s70, v45, 22
-; GCN-NEXT:    v_readlane_b32 s69, v45, 21
-; GCN-NEXT:    v_readlane_b32 s68, v45, 20
-; GCN-NEXT:    v_readlane_b32 s67, v45, 19
-; GCN-NEXT:    v_readlane_b32 s66, v45, 18
-; GCN-NEXT:    v_readlane_b32 s65, v45, 17
-; GCN-NEXT:    v_readlane_b32 s64, v45, 16
-; GCN-NEXT:    v_readlane_b32 s55, v45, 15
-; GCN-NEXT:    v_readlane_b32 s54, v45, 14
-; GCN-NEXT:    v_readlane_b32 s53, v45, 13
-; GCN-NEXT:    v_readlane_b32 s52, v45, 12
-; GCN-NEXT:    v_readlane_b32 s51, v45, 11
-; GCN-NEXT:    v_readlane_b32 s50, v45, 10
-; GCN-NEXT:    v_readlane_b32 s49, v45, 9
-; GCN-NEXT:    v_readlane_b32 s48, v45, 8
-; GCN-NEXT:    v_readlane_b32 s39, v45, 7
-; GCN-NEXT:    v_readlane_b32 s38, v45, 6
-; GCN-NEXT:    v_readlane_b32 s37, v45, 5
-; GCN-NEXT:    v_readlane_b32 s36, v45, 4
-; GCN-NEXT:    v_readlane_b32 s35, v45, 3
-; GCN-NEXT:    v_readlane_b32 s34, v45, 2
-; GCN-NEXT:    v_readlane_b32 s31, v45, 1
-; GCN-NEXT:    v_readlane_b32 s30, v45, 0
+; GCN-NEXT:    v_readlane_b32 s30, v45, 24
+; GCN-NEXT:    v_readlane_b32 s31, v45, 25
+; GCN-NEXT:    v_readlane_b32 s81, v45, 23
+; GCN-NEXT:    v_readlane_b32 s80, v45, 22
+; GCN-NEXT:    v_readlane_b32 s71, v45, 21
+; GCN-NEXT:    v_readlane_b32 s70, v45, 20
+; GCN-NEXT:    v_readlane_b32 s69, v45, 19
+; GCN-NEXT:    v_readlane_b32 s68, v45, 18
+; GCN-NEXT:    v_readlane_b32 s67, v45, 17
+; GCN-NEXT:    v_readlane_b32 s66, v45, 16
+; GCN-NEXT:    v_readlane_b32 s65, v45, 15
+; GCN-NEXT:    v_readlane_b32 s64, v45, 14
+; GCN-NEXT:    v_readlane_b32 s55, v45, 13
+; GCN-NEXT:    v_readlane_b32 s54, v45, 12
+; GCN-NEXT:    v_readlane_b32 s53, v45, 11
+; GCN-NEXT:    v_readlane_b32 s52, v45, 10
+; GCN-NEXT:    v_readlane_b32 s51, v45, 9
+; GCN-NEXT:    v_readlane_b32 s50, v45, 8
+; GCN-NEXT:    v_readlane_b32 s49, v45, 7
+; GCN-NEXT:    v_readlane_b32 s48, v45, 6
+; GCN-NEXT:    v_readlane_b32 s39, v45, 5
+; GCN-NEXT:    v_readlane_b32 s38, v45, 4
+; GCN-NEXT:    v_readlane_b32 s37, v45, 3
+; GCN-NEXT:    v_readlane_b32 s36, v45, 2
+; GCN-NEXT:    v_readlane_b32 s35, v45, 1
+; GCN-NEXT:    v_readlane_b32 s34, v45, 0
 ; GCN-NEXT:    buffer_load_dword v44, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GCN-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GCN-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
index 9ce8859c410fc..c902ac6173935 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
@@ -52,8 +52,8 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_readlane_b32 s31, v44, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v44, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v44, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v44, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -109,8 +109,8 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ; GFX10-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:8
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:12
-; GFX10-NEXT:    v_readlane_b32 s31, v44, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v44, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v44, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s4, v44, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s5, -1
@@ -163,8 +163,8 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ; GFX11-NEXT:    scratch_load_b32 v42, off, s33 offset:4
 ; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:8
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:12
-; GFX11-NEXT:    v_readlane_b32 s31, v44, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v44, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v44, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v44, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -236,8 +236,8 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_readlane_b32 s31, v45, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v45, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v45, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v45, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -286,8 +286,8 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ; GFX10-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8
 ; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:12
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:16
-; GFX10-NEXT:    v_readlane_b32 s31, v45, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v45, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v45, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s4, v45, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s5, -1
@@ -335,8 +335,8 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ; GFX11-NEXT:    scratch_load_b32 v42, off, s33 offset:8
 ; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:12
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:16
-; GFX11-NEXT:    v_readlane_b32 s31, v45, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v45, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v45, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v45, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll
index 1805add14e47a..a76f4495f85d5 100644
--- a/llvm/test/CodeGen/AMDGPU/wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -3086,8 +3086,8 @@ define void @callee_no_stack_with_call() #1 {
 ; GFX1032-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX1032-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX1032-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX1032-NEXT:    s_mov_b32 s32, s33
 ; GFX1032-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX1032-NEXT:    s_or_saveexec_b32 s5, -1
@@ -3117,8 +3117,8 @@ define void @callee_no_stack_with_call() #1 {
 ; GFX1064-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX1064-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX1064-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX1064-NEXT:    s_mov_b32 s32, s33
 ; GFX1064-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX1064-NEXT:    s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll
index 50195bc5bf414..087eecbfda19e 100644
--- a/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll
+++ b/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll
@@ -1593,8 +1593,8 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; DAGISEL-NEXT:    s_wait_alu 0xfffe
 ; DAGISEL-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; DAGISEL-NEXT:    v_readlane_b32 s31, v40, 2
 ; DAGISEL-NEXT:    v_readlane_b32 s30, v40, 1
+; DAGISEL-NEXT:    v_readlane_b32 s31, v40, 2
 ; DAGISEL-NEXT:    v_readlane_b32 s4, v40, 0
 ; DAGISEL-NEXT:    v_readlane_b32 s0, v40, 3
 ; DAGISEL-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
@@ -1929,8 +1929,8 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; GISEL-NEXT:    s_wait_alu 0xfffe
 ; GISEL-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GISEL-NEXT:    v_readlane_b32 s31, v40, 2
 ; GISEL-NEXT:    v_readlane_b32 s30, v40, 1
+; GISEL-NEXT:    v_readlane_b32 s31, v40, 2
 ; GISEL-NEXT:    v_readlane_b32 s4, v40, 0
 ; GISEL-NEXT:    v_readlane_b32 s0, v40, 3
 ; GISEL-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
@@ -2266,8 +2266,8 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; DAGISEL64-NEXT:    s_wait_alu 0xfffe
 ; DAGISEL64-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; DAGISEL64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; DAGISEL64-NEXT:    v_readlane_b32 s31, v40, 3
 ; DAGISEL64-NEXT:    v_readlane_b32 s30, v40, 2
+; DAGISEL64-NEXT:    v_readlane_b32 s31, v40, 3
 ; DAGISEL64-NEXT:    v_readlane_b32 s5, v40, 1
 ; DAGISEL64-NEXT:    v_readlane_b32 s4, v40, 0
 ; DAGISEL64-NEXT:    v_readlane_b32 s0, v40, 4
@@ -2604,8 +2604,8 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; GISEL64-NEXT:    s_wait_alu 0xfffe
 ; GISEL64-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GISEL64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GISEL64-NEXT:    v_readlane_b32 s31, v40, 3
 ; GISEL64-NEXT:    v_readlane_b32 s30, v40, 2
+; GISEL64-NEXT:    v_readlane_b32 s31, v40, 3
 ; GISEL64-NEXT:    v_readlane_b32 s5, v40, 1
 ; GISEL64-NEXT:    v_readlane_b32 s4, v40, 0
 ; GISEL64-NEXT:    v_readlane_b32 s0, v40, 4
@@ -3719,8 +3719,8 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; GFX1250-DAGISEL-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX1250-DAGISEL-NEXT:    s_swap_pc_i64 s[30:31], s[0:1]
 ; GFX1250-DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-DAGISEL-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX1250-DAGISEL-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX1250-DAGISEL-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX1250-DAGISEL-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX1250-DAGISEL-NEXT:    v_readlane_b32 s0, v40, 3
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
@@ -8049,8 +8049,8 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; DAGISEL-NEXT:    s_wait_alu 0xfffe
 ; DAGISEL-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; DAGISEL-NEXT:    flat_store_b32 v[40:41], v0
-; DAGISEL-NEXT:    v_readlane_b32 s31, v42, 2
 ; DAGISEL-NEXT:    v_readlane_b32 s30, v42, 1
+; DAGISEL-NEXT:    v_readlane_b32 s31, v42, 2
 ; DAGISEL-NEXT:    v_readlane_b32 s4, v42, 0
 ; DAGISEL-NEXT:    v_readlane_b32 s0, v42, 3
 ; DAGISEL-NEXT:    s_clause 0x2 ; 12-byte Folded Reload
@@ -8390,8 +8390,8 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; GISEL-NEXT:    s_wait_alu 0xfffe
 ; GISEL-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GISEL-NEXT:    flat_store_b32 v[40:41], v0
-; GISEL-NEXT:    v_readlane_b32 s31, v42, 2
 ; GISEL-NEXT:    v_readlane_b32 s30, v42, 1
+; GISEL-NEXT:    v_readlane_b32 s31, v42, 2
 ; GISEL-NEXT:    v_readlane_b32 s4, v42, 0
 ; GISEL-NEXT:    v_readlane_b32 s0, v42, 3
 ; GISEL-NEXT:    s_clause 0x2 ; 12-byte Folded Reload
@@ -8733,8 +8733,8 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; DAGISEL64-NEXT:    s_wait_alu 0xfffe
 ; DAGISEL64-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; DAGISEL64-NEXT:    flat_store_b32 v[40:41], v0
-; DAGISEL64-NEXT:    v_readlane_b32 s31, v42, 3
 ; DAGISEL64-NEXT:    v_readlane_b32 s30, v42, 2
+; DAGISEL64-NEXT:    v_readlane_b32 s31, v42, 3
 ; DAGISEL64-NEXT:    v_readlane_b32 s5, v42, 1
 ; DAGISEL64-NEXT:    v_readlane_b32 s4, v42, 0
 ; DAGISEL64-NEXT:    v_readlane_b32 s0, v42, 4
@@ -9077,8 +9077,8 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; GISEL64-NEXT:    s_wait_alu 0xfffe
 ; GISEL64-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GISEL64-NEXT:    flat_store_b32 v[40:41], v0
-; GISEL64-NEXT:    v_readlane_b32 s31, v42, 3
 ; GISEL64-NEXT:    v_readlane_b32 s30, v42, 2
+; GISEL64-NEXT:    v_readlane_b32 s31, v42, 3
 ; GISEL64-NEXT:    v_readlane_b32 s5, v42, 1
 ; GISEL64-NEXT:    v_readlane_b32 s4, v42, 0
 ; GISEL64-NEXT:    v_readlane_b32 s0, v42, 4
@@ -10198,8 +10198,8 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; GFX1250-DAGISEL-NEXT:    v_writelane_b32 v42, s31, 2
 ; GFX1250-DAGISEL-NEXT:    s_swap_pc_i64 s[30:31], s[0:1]
 ; GFX1250-DAGISEL-NEXT:    flat_store_b32 v[40:41], v0
-; GFX1250-DAGISEL-NEXT:    v_readlane_b32 s31, v42, 2
 ; GFX1250-DAGISEL-NEXT:    v_readlane_b32 s30, v42, 1
+; GFX1250-DAGISEL-NEXT:    v_readlane_b32 s31, v42, 2
 ; GFX1250-DAGISEL-NEXT:    v_readlane_b32 s4, v42, 0
 ; GFX1250-DAGISEL-NEXT:    v_readlane_b32 s0, v42, 3
 ; GFX1250-DAGISEL-NEXT:    s_clause 0x2 ; 12-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll
index 06c451869e841..3fe54cd045c0f 100644
--- a/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll
@@ -41,12 +41,12 @@ define void @vector_reg_liverange_split() #0 {
 ; GFX90A-NEXT:    s_or_saveexec_b64 s[28:29], -1
 ; GFX90A-NEXT:    v_accvgpr_read_b32 v39, a32
 ; GFX90A-NEXT:    s_mov_b64 exec, s[28:29]
+; GFX90A-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX90A-NEXT:    v_readlane_b32 s20, v39, 0
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s20
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX90A-NEXT:    s_mov_b32 s32, s33
 ; GFX90A-NEXT:    v_readlane_b32 s4, v40, 4
 ; GFX90A-NEXT:    v_readlane_b32 s28, v40, 2
diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll
index 9e9fe1809c780..b5c0023bd5c2a 100644
--- a/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll
@@ -49,10 +49,10 @@ define void @test() #0 {
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_readlane_b32 s4, v39, 0
 ; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    v_readlane_b32 s30, v40, 0
 ; GCN-NEXT:    global_store_dword v[0:1], v0, off
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_readlane_b32 s31, v40, 1
-; GCN-NEXT:    v_readlane_b32 s30, v40, 0
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 4
 ; GCN-NEXT:    v_readlane_b32 s28, v40, 2
@@ -111,8 +111,8 @@ define void @test() #0 {
 ; GCN-O0-NEXT:    v_mov_b32_e32 v2, s4
 ; GCN-O0-NEXT:    global_store_dword v[0:1], v2, off
 ; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
-; GCN-O0-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-O0-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-O0-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-O0-NEXT:    s_mov_b32 s32, s33
 ; GCN-O0-NEXT:    v_readlane_b32 s4, v40, 4
 ; GCN-O0-NEXT:    v_readlane_b32 s28, v40, 2
diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
index e9a0671ead4e0..fe641367944be 100644
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
@@ -387,8 +387,8 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[40:41]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[36:39], s34 offset:4
-; GFX9-O0-NEXT:    v_readlane_b32 s31, v3, 1
 ; GFX9-O0-NEXT:    v_readlane_b32 s30, v3, 0
+; GFX9-O0-NEXT:    v_readlane_b32 s31, v3, 1
 ; GFX9-O0-NEXT:    s_mov_b32 s32, s33
 ; GFX9-O0-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -424,9 +424,9 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg
 ; GFX9-O3-NEXT:    v_add_u32_e32 v1, v1, v2
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v1
+; GFX9-O3-NEXT:    v_readlane_b32 s30, v3, 0
 ; GFX9-O3-NEXT:    buffer_store_dword v0, off, s[4:7], 0 offset:4
 ; GFX9-O3-NEXT:    v_readlane_b32 s31, v3, 1
-; GFX9-O3-NEXT:    v_readlane_b32 s30, v3, 0
 ; GFX9-O3-NEXT:    s_mov_b32 s32, s33
 ; GFX9-O3-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-O3-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -622,8 +622,8 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX9-O0-NEXT:    s_mov_b32 s34, 0
 ; GFX9-O0-NEXT:    buffer_store_dwordx2 v[0:1], off, s[36:39], s34 offset:4
-; GFX9-O0-NEXT:    v_readlane_b32 s31, v10, 1
 ; GFX9-O0-NEXT:    v_readlane_b32 s30, v10, 0
+; GFX9-O0-NEXT:    v_readlane_b32 s31, v10, 1
 ; GFX9-O0-NEXT:    s_mov_b32 s32, s33
 ; GFX9-O0-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -683,9 +683,9 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v1, v3
+; GFX9-O3-NEXT:    v_readlane_b32 s30, v8, 0
 ; GFX9-O3-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0 offset:4
 ; GFX9-O3-NEXT:    v_readlane_b32 s31, v8, 1
-; GFX9-O3-NEXT:    v_readlane_b32 s30, v8, 0
 ; GFX9-O3-NEXT:    s_mov_b32 s32, s33
 ; GFX9-O3-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-O3-NEXT:    buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload



More information about the llvm-branch-commits mailing list