[llvm-branch-commits] [llvm] Use register pair for PC spill (PR #169098)
Scott Linder via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Mon Nov 24 10:18:30 PST 2025
https://github.com/slinder1 updated https://github.com/llvm/llvm-project/pull/169098
>From b46525ba7f2e122695a399d6a8dd049ae3295cef Mon Sep 17 00:00:00 2001
From: Scott Linder <Scott.Linder at amd.com>
Date: Wed, 29 Oct 2025 18:46:12 +0000
Subject: [PATCH] Use register pair for PC spill
---
llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp | 20 +
.../CodeGen/AMDGPU/GlobalISel/assert-align.ll | 2 +-
.../GlobalISel/call-outgoing-stack-args.ll | 8 +-
.../CodeGen/AMDGPU/GlobalISel/localizer.ll | 2 +-
.../abi-attribute-hints-undefined-behavior.ll | 2 +-
.../CodeGen/AMDGPU/amdgcn-call-whole-wave.ll | 8 +-
.../CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll | 44 +-
.../amdgpu-simplify-libcall-pow-codegen.ll | 280 +--
...tor-flatscratchinit-undefined-behavior2.ll | 13 +-
llvm/test/CodeGen/AMDGPU/bf16.ll | 166 +-
.../test/CodeGen/AMDGPU/branch-relax-spill.ll | 156 +-
.../CodeGen/AMDGPU/call-argument-types.ll | 40 +-
.../AMDGPU/call-graph-register-usage.ll | 2 +-
.../AMDGPU/call-preserved-registers.ll | 116 +-
.../test/CodeGen/AMDGPU/callee-frame-setup.ll | 106 +-
.../callee-special-input-vgprs-packed.ll | 14 +-
.../AMDGPU/callee-special-input-vgprs.ll | 14 +-
.../AMDGPU/cross-block-use-is-not-abi-copy.ll | 8 +-
llvm/test/CodeGen/AMDGPU/debug-frame.ll | 8 +-
.../AMDGPU/dwarf-multi-register-use-crash.ll | 64 +-
.../dynamic-vgpr-reserve-stack-for-cwsr.ll | 4 +-
.../fix-frame-reg-in-custom-csr-spills.ll | 2 +-
...frame-setup-without-sgpr-to-vgpr-spills.ll | 25 +-
.../CodeGen/AMDGPU/function-args-inreg.ll | 8 +-
.../CodeGen/AMDGPU/gfx-call-non-gfx-func.ll | 144 +-
.../AMDGPU/gfx-callable-argument-types.ll | 1234 +++++++------
.../gfx-callable-preserved-registers.ll | 72 +-
llvm/test/CodeGen/AMDGPU/global-alias.ll | 2 +-
.../identical-subrange-spill-infloop.ll | 92 +-
llvm/test/CodeGen/AMDGPU/indirect-call.ll | 1104 +++++------
.../CodeGen/AMDGPU/insert-delay-alu-bug.ll | 2 +-
.../CodeGen/AMDGPU/insert-waitcnts-crash.ll | 12 +-
.../AMDGPU/materialize-frame-index-sgpr.ll | 1634 +++++++++--------
.../CodeGen/AMDGPU/mul24-pass-ordering.ll | 20 +-
.../AMDGPU/need-fp-from-vgpr-spills.ll | 6 +-
llvm/test/CodeGen/AMDGPU/nested-calls.ll | 4 +-
.../AMDGPU/no-source-locations-in-prologue.ll | 2 +-
llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll | 6 +-
.../AMDGPU/sgpr-spill-overlap-wwm-reserve.mir | 190 +-
.../AMDGPU/sgpr-spills-split-regalloc.ll | 27 +-
.../AMDGPU/shufflevector.v2i64.v8i64.ll | 299 +--
.../si-lower-sgpr-spills-vgpr-lanes-usage.mir | 18 +-
llvm/test/CodeGen/AMDGPU/sibling-call.ll | 222 +--
llvm/test/CodeGen/AMDGPU/stack-realign.ll | 2 +-
.../CodeGen/AMDGPU/stacksave_stackrestore.ll | 10 +-
.../AMDGPU/strictfp_f16_abi_promote.ll | 14 +-
.../CodeGen/AMDGPU/swdev504645-global-fold.ll | 3 +-
.../AMDGPU/tail-call-inreg-arguments.error.ll | 4 +-
...unfold-masked-merge-scalar-variablemask.ll | 38 +-
.../AMDGPU/unstructured-cfg-def-use-issue.ll | 168 +-
.../CodeGen/AMDGPU/vgpr-tuple-allocation.ll | 12 +-
llvm/test/CodeGen/AMDGPU/wave32.ll | 4 +-
.../CodeGen/AMDGPU/whole-wave-functions.ll | 20 +-
.../AMDGPU/whole-wave-register-copy.ll | 2 +-
.../AMDGPU/whole-wave-register-spill.ll | 4 +-
.../test/CodeGen/AMDGPU/wwm-reserved-spill.ll | 8 +-
56 files changed, 3262 insertions(+), 3229 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
index cbd08f0fb5dff..5161a75097aeb 100644
--- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
@@ -267,11 +267,19 @@ bool SILowerSGPRSpills::spillCalleeSavedRegs(
std::vector<CalleeSavedInfo> CSI;
const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
+ Register RetAddrReg = TRI->getReturnAddressReg(MF);
+ bool SpillRetAddrReg = false;
for (unsigned I = 0; CSRegs[I]; ++I) {
MCRegister Reg = CSRegs[I];
if (SavedRegs.test(Reg)) {
+ if (Reg == TRI->getSubReg(RetAddrReg, AMDGPU::sub0) ||
+ Reg == TRI->getSubReg(RetAddrReg, AMDGPU::sub1)) {
+ SpillRetAddrReg = true;
+ continue;
+ }
+
const TargetRegisterClass *RC =
TRI->getMinimalPhysRegClass(Reg, MVT::i32);
int JunkFI = MFI.CreateStackObject(TRI->getSpillSize(*RC),
@@ -282,6 +290,18 @@ bool SILowerSGPRSpills::spillCalleeSavedRegs(
}
}
+ // Return address uses a register pair. Add the super register to the
+ // CSI list so that it's easier to identify the entire spill and CFI
+ // can be emitted appropriately.
+ if (SpillRetAddrReg) {
+ const TargetRegisterClass *RC =
+ TRI->getMinimalPhysRegClass(RetAddrReg, MVT::i64);
+ int JunkFI = MFI.CreateStackObject(TRI->getSpillSize(*RC),
+ TRI->getSpillAlign(*RC), true);
+ CSI.push_back(CalleeSavedInfo(RetAddrReg, JunkFI));
+ CalleeSavedFIs.push_back(JunkFI);
+ }
+
if (!CSI.empty()) {
for (MachineBasicBlock *SaveBlock : SaveBlocks)
insertCSRSaves(*SaveBlock, CSI, Indexes, LIS);
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll
index b84b31cd2702c..023398377de94 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll
@@ -23,10 +23,10 @@ define ptr addrspace(1) @call_assert_align() {
; CHECK-NEXT: v_writelane_b32 v40, s31, 1
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_mov_b32_e32 v2, 0
+; CHECK-NEXT: v_readlane_b32 s30, v40, 0
; CHECK-NEXT: global_store_dword v[0:1], v2, off
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_readlane_b32 s31, v40, 1
-; CHECK-NEXT: v_readlane_b32 s30, v40, 0
; CHECK-NEXT: s_mov_b32 s32, s33
; CHECK-NEXT: v_readlane_b32 s4, v40, 2
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
index 7e6f500181ec6..2c1beb8468576 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
@@ -238,8 +238,8 @@ define void @func_caller_stack() #2 {
; MUBUF-NEXT: v_writelane_b32 v40, s31, 1
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16
; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; MUBUF-NEXT: v_readlane_b32 s31, v40, 1
; MUBUF-NEXT: v_readlane_b32 s30, v40, 0
+; MUBUF-NEXT: v_readlane_b32 s31, v40, 1
; MUBUF-NEXT: s_mov_b32 s32, s33
; MUBUF-NEXT: v_readlane_b32 s4, v40, 2
; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -277,8 +277,8 @@ define void @func_caller_stack() #2 {
; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_v16i32_v16i32_v4i32@rel32@hi+12
; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1
; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1
; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0
+; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1
; FLATSCR-NEXT: s_mov_b32 s32, s33
; FLATSCR-NEXT: v_readlane_b32 s0, v40, 2
; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1
@@ -363,8 +363,8 @@ define void @func_caller_byval(ptr addrspace(5) %argptr) #2 {
; MUBUF-NEXT: s_waitcnt vmcnt(1)
; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:60
; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; MUBUF-NEXT: v_readlane_b32 s31, v40, 1
; MUBUF-NEXT: v_readlane_b32 s30, v40, 0
+; MUBUF-NEXT: v_readlane_b32 s31, v40, 1
; MUBUF-NEXT: s_mov_b32 s32, s33
; MUBUF-NEXT: v_readlane_b32 s4, v40, 2
; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -414,8 +414,8 @@ define void @func_caller_byval(ptr addrspace(5) %argptr) #2 {
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s32 offset:56
; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1
; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0
+; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1
; FLATSCR-NEXT: s_mov_b32 s32, s33
; FLATSCR-NEXT: v_readlane_b32 s0, v40, 2
; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
index 72766f47030cc..35591cd602992 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
@@ -244,8 +244,8 @@ define void @sink_null_insert_pt(ptr addrspace(4) %arg0) #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], 0
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s4, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
index 3194581fa4213..0e24430e7be20 100644
--- a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
+++ b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
@@ -30,8 +30,8 @@ define void @parent_func_missing_inputs() #0 {
; FIXEDABI-NEXT: s_addc_u32 s17, s17, requires_all_inputs@rel32@hi+12
; FIXEDABI-NEXT: v_writelane_b32 v40, s31, 1
; FIXEDABI-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; FIXEDABI-NEXT: v_readlane_b32 s31, v40, 1
; FIXEDABI-NEXT: v_readlane_b32 s30, v40, 0
+; FIXEDABI-NEXT: v_readlane_b32 s31, v40, 1
; FIXEDABI-NEXT: s_mov_b32 s32, s33
; FIXEDABI-NEXT: v_readlane_b32 s4, v40, 2
; FIXEDABI-NEXT: s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn-call-whole-wave.ll b/llvm/test/CodeGen/AMDGPU/amdgcn-call-whole-wave.ll
index 149b0cb4e052d..b6e65c8842904 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn-call-whole-wave.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn-call-whole-wave.ll
@@ -35,8 +35,8 @@ define amdgpu_gfx void @basic_test(i32 %x, i32 inreg %c, ptr addrspace(1) %ptr)
; DAGISEL-NEXT: s_clause 0x1
; DAGISEL-NEXT: scratch_load_b32 v41, off, s33
; DAGISEL-NEXT: scratch_load_b32 v40, off, s33 offset:4
-; DAGISEL-NEXT: v_readlane_b32 s31, v42, 1
; DAGISEL-NEXT: v_readlane_b32 s30, v42, 0
+; DAGISEL-NEXT: v_readlane_b32 s31, v42, 1
; DAGISEL-NEXT: s_mov_b32 s32, s33
; DAGISEL-NEXT: v_readlane_b32 s0, v42, 2
; DAGISEL-NEXT: s_or_saveexec_b32 s1, -1
@@ -78,8 +78,8 @@ define amdgpu_gfx void @basic_test(i32 %x, i32 inreg %c, ptr addrspace(1) %ptr)
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: scratch_load_b32 v41, off, s33
; GISEL-NEXT: scratch_load_b32 v40, off, s33 offset:4
-; GISEL-NEXT: v_readlane_b32 s31, v42, 1
; GISEL-NEXT: v_readlane_b32 s30, v42, 0
+; GISEL-NEXT: v_readlane_b32 s31, v42, 1
; GISEL-NEXT: s_mov_b32 s32, s33
; GISEL-NEXT: v_readlane_b32 s0, v42, 2
; GISEL-NEXT: s_or_saveexec_b32 s1, -1
@@ -787,8 +787,8 @@ define amdgpu_gfx void @ret_void(i32 %x) #0 {
; DAGISEL-NEXT: s_wait_alu 0xfffe
; DAGISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; DAGISEL-NEXT: v_readlane_b32 s31, v40, 1
; DAGISEL-NEXT: v_readlane_b32 s30, v40, 0
+; DAGISEL-NEXT: v_readlane_b32 s31, v40, 1
; DAGISEL-NEXT: s_mov_b32 s32, s33
; DAGISEL-NEXT: v_readlane_b32 s0, v40, 2
; DAGISEL-NEXT: s_or_saveexec_b32 s1, -1
@@ -822,8 +822,8 @@ define amdgpu_gfx void @ret_void(i32 %x) #0 {
; GISEL-NEXT: s_wait_alu 0xfffe
; GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-NEXT: v_readlane_b32 s31, v40, 1
; GISEL-NEXT: v_readlane_b32 s30, v40, 0
+; GISEL-NEXT: v_readlane_b32 s31, v40, 1
; GISEL-NEXT: s_mov_b32 s32, s33
; GISEL-NEXT: v_readlane_b32 s0, v40, 2
; GISEL-NEXT: s_or_saveexec_b32 s1, -1
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
index 6fae7fdbbf9bb..1a6e0e27812fe 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
@@ -16525,18 +16525,17 @@ define inreg <40 x i8> @bitcast_v20i16_to_v40i8_scalar(<20 x i16> inreg %a, i32
; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v8, s30, 0
-; SI-NEXT: v_writelane_b32 v8, s31, 1
-; SI-NEXT: v_writelane_b32 v8, s34, 2
-; SI-NEXT: v_writelane_b32 v8, s35, 3
-; SI-NEXT: v_writelane_b32 v8, s36, 4
-; SI-NEXT: v_writelane_b32 v8, s37, 5
-; SI-NEXT: v_writelane_b32 v8, s38, 6
-; SI-NEXT: v_writelane_b32 v8, s39, 7
-; SI-NEXT: v_writelane_b32 v8, s48, 8
-; SI-NEXT: v_writelane_b32 v8, s49, 9
+; SI-NEXT: v_writelane_b32 v8, s34, 0
+; SI-NEXT: v_writelane_b32 v8, s35, 1
+; SI-NEXT: v_writelane_b32 v8, s36, 2
+; SI-NEXT: v_writelane_b32 v8, s37, 3
+; SI-NEXT: v_writelane_b32 v8, s38, 4
+; SI-NEXT: v_writelane_b32 v8, s39, 5
+; SI-NEXT: v_writelane_b32 v8, s48, 6
+; SI-NEXT: v_writelane_b32 v8, s49, 7
+; SI-NEXT: v_writelane_b32 v8, s50, 8
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7
-; SI-NEXT: v_writelane_b32 v8, s50, 10
+; SI-NEXT: v_writelane_b32 v8, s30, 9
; SI-NEXT: v_readfirstlane_b32 s39, v6
; SI-NEXT: v_readfirstlane_b32 s48, v5
; SI-NEXT: v_readfirstlane_b32 s49, v4
@@ -16544,6 +16543,7 @@ define inreg <40 x i8> @bitcast_v20i16_to_v40i8_scalar(<20 x i16> inreg %a, i32
; SI-NEXT: v_readfirstlane_b32 s35, v2
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: v_readfirstlane_b32 s38, v1
+; SI-NEXT: v_writelane_b32 v8, s31, 10
; SI-NEXT: s_cbranch_scc0 .LBB49_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_and_b32 s4, s16, 0xffff
@@ -16815,18 +16815,18 @@ define inreg <40 x i8> @bitcast_v20i16_to_v40i8_scalar(<20 x i16> inreg %a, i32
; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
; SI-NEXT: v_add_i32_e32 v0, vcc, 36, v0
; SI-NEXT: v_mov_b32_e32 v1, s4
+; SI-NEXT: v_readlane_b32 s30, v8, 9
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: v_readlane_b32 s50, v8, 10
-; SI-NEXT: v_readlane_b32 s49, v8, 9
-; SI-NEXT: v_readlane_b32 s48, v8, 8
-; SI-NEXT: v_readlane_b32 s39, v8, 7
-; SI-NEXT: v_readlane_b32 s38, v8, 6
-; SI-NEXT: v_readlane_b32 s37, v8, 5
-; SI-NEXT: v_readlane_b32 s36, v8, 4
-; SI-NEXT: v_readlane_b32 s35, v8, 3
-; SI-NEXT: v_readlane_b32 s34, v8, 2
-; SI-NEXT: v_readlane_b32 s31, v8, 1
-; SI-NEXT: v_readlane_b32 s30, v8, 0
+; SI-NEXT: v_readlane_b32 s31, v8, 10
+; SI-NEXT: v_readlane_b32 s50, v8, 8
+; SI-NEXT: v_readlane_b32 s49, v8, 7
+; SI-NEXT: v_readlane_b32 s48, v8, 6
+; SI-NEXT: v_readlane_b32 s39, v8, 5
+; SI-NEXT: v_readlane_b32 s38, v8, 4
+; SI-NEXT: v_readlane_b32 s37, v8, 3
+; SI-NEXT: v_readlane_b32 s36, v8, 2
+; SI-NEXT: v_readlane_b32 s35, v8, 1
+; SI-NEXT: v_readlane_b32 s34, v8, 0
; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
index 0329f23ea434f..6bbfcc9f52d95 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
@@ -118,32 +118,32 @@ define double @test_pow_fast_f64__integral_y(double %x, i32 %y.i) {
; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[18:19]
; CHECK-NEXT: v_writelane_b32 v43, s16, 14
-; CHECK-NEXT: v_writelane_b32 v43, s30, 0
-; CHECK-NEXT: v_writelane_b32 v43, s31, 1
-; CHECK-NEXT: v_writelane_b32 v43, s34, 2
-; CHECK-NEXT: v_writelane_b32 v43, s35, 3
-; CHECK-NEXT: v_writelane_b32 v43, s36, 4
-; CHECK-NEXT: v_writelane_b32 v43, s37, 5
-; CHECK-NEXT: v_writelane_b32 v43, s38, 6
-; CHECK-NEXT: v_writelane_b32 v43, s39, 7
+; CHECK-NEXT: v_writelane_b32 v43, s34, 0
+; CHECK-NEXT: v_writelane_b32 v43, s35, 1
+; CHECK-NEXT: v_writelane_b32 v43, s36, 2
+; CHECK-NEXT: v_writelane_b32 v43, s37, 3
+; CHECK-NEXT: v_writelane_b32 v43, s38, 4
+; CHECK-NEXT: v_writelane_b32 v43, s39, 5
; CHECK-NEXT: s_addk_i32 s32, 0x800
-; CHECK-NEXT: v_writelane_b32 v43, s48, 8
-; CHECK-NEXT: v_writelane_b32 v43, s49, 9
+; CHECK-NEXT: v_writelane_b32 v43, s48, 6
+; CHECK-NEXT: v_writelane_b32 v43, s49, 7
; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5]
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12
+; CHECK-NEXT: v_writelane_b32 v43, s50, 8
; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT: v_writelane_b32 v43, s50, 10
+; CHECK-NEXT: v_writelane_b32 v43, s51, 9
+; CHECK-NEXT: v_writelane_b32 v43, s52, 10
; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
-; CHECK-NEXT: v_writelane_b32 v43, s51, 11
+; CHECK-NEXT: v_writelane_b32 v43, s53, 11
; CHECK-NEXT: v_mov_b32_e32 v42, v1
-; CHECK-NEXT: v_writelane_b32 v43, s52, 12
+; CHECK-NEXT: v_writelane_b32 v43, s30, 12
; CHECK-NEXT: v_and_b32_e32 v1, 0x7fffffff, v42
; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49]
-; CHECK-NEXT: v_writelane_b32 v43, s53, 13
+; CHECK-NEXT: v_writelane_b32 v43, s31, 13
; CHECK-NEXT: v_mov_b32_e32 v40, v31
; CHECK-NEXT: v_mov_b32_e32 v41, v2
; CHECK-NEXT: s_mov_b32 s50, s15
@@ -177,21 +177,21 @@ define double @test_pow_fast_f64__integral_y(double %x, i32 %y.i) {
; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; CHECK-NEXT: v_readlane_b32 s30, v43, 12
; CHECK-NEXT: v_or_b32_e32 v1, v2, v1
-; CHECK-NEXT: v_readlane_b32 s53, v43, 13
-; CHECK-NEXT: v_readlane_b32 s52, v43, 12
-; CHECK-NEXT: v_readlane_b32 s51, v43, 11
-; CHECK-NEXT: v_readlane_b32 s50, v43, 10
-; CHECK-NEXT: v_readlane_b32 s49, v43, 9
-; CHECK-NEXT: v_readlane_b32 s48, v43, 8
-; CHECK-NEXT: v_readlane_b32 s39, v43, 7
-; CHECK-NEXT: v_readlane_b32 s38, v43, 6
-; CHECK-NEXT: v_readlane_b32 s37, v43, 5
-; CHECK-NEXT: v_readlane_b32 s36, v43, 4
-; CHECK-NEXT: v_readlane_b32 s35, v43, 3
-; CHECK-NEXT: v_readlane_b32 s34, v43, 2
-; CHECK-NEXT: v_readlane_b32 s31, v43, 1
-; CHECK-NEXT: v_readlane_b32 s30, v43, 0
+; CHECK-NEXT: v_readlane_b32 s31, v43, 13
+; CHECK-NEXT: v_readlane_b32 s53, v43, 11
+; CHECK-NEXT: v_readlane_b32 s52, v43, 10
+; CHECK-NEXT: v_readlane_b32 s51, v43, 9
+; CHECK-NEXT: v_readlane_b32 s50, v43, 8
+; CHECK-NEXT: v_readlane_b32 s49, v43, 7
+; CHECK-NEXT: v_readlane_b32 s48, v43, 6
+; CHECK-NEXT: v_readlane_b32 s39, v43, 5
+; CHECK-NEXT: v_readlane_b32 s38, v43, 4
+; CHECK-NEXT: v_readlane_b32 s37, v43, 3
+; CHECK-NEXT: v_readlane_b32 s36, v43, 2
+; CHECK-NEXT: v_readlane_b32 s35, v43, 1
+; CHECK-NEXT: v_readlane_b32 s34, v43, 0
; CHECK-NEXT: s_mov_b32 s32, s33
; CHECK-NEXT: v_readlane_b32 s4, v43, 14
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -258,30 +258,30 @@ define double @test_powr_fast_f64(double %x, double %y) {
; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[18:19]
; CHECK-NEXT: v_writelane_b32 v43, s16, 14
-; CHECK-NEXT: v_writelane_b32 v43, s30, 0
-; CHECK-NEXT: v_writelane_b32 v43, s31, 1
-; CHECK-NEXT: v_writelane_b32 v43, s34, 2
-; CHECK-NEXT: v_writelane_b32 v43, s35, 3
-; CHECK-NEXT: v_writelane_b32 v43, s36, 4
-; CHECK-NEXT: v_writelane_b32 v43, s37, 5
-; CHECK-NEXT: v_writelane_b32 v43, s38, 6
-; CHECK-NEXT: v_writelane_b32 v43, s39, 7
+; CHECK-NEXT: v_writelane_b32 v43, s34, 0
+; CHECK-NEXT: v_writelane_b32 v43, s35, 1
+; CHECK-NEXT: v_writelane_b32 v43, s36, 2
+; CHECK-NEXT: v_writelane_b32 v43, s37, 3
+; CHECK-NEXT: v_writelane_b32 v43, s38, 4
+; CHECK-NEXT: v_writelane_b32 v43, s39, 5
; CHECK-NEXT: s_addk_i32 s32, 0x800
-; CHECK-NEXT: v_writelane_b32 v43, s48, 8
-; CHECK-NEXT: v_writelane_b32 v43, s49, 9
+; CHECK-NEXT: v_writelane_b32 v43, s48, 6
+; CHECK-NEXT: v_writelane_b32 v43, s49, 7
; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5]
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12
+; CHECK-NEXT: v_writelane_b32 v43, s50, 8
; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT: v_writelane_b32 v43, s50, 10
-; CHECK-NEXT: v_writelane_b32 v43, s51, 11
-; CHECK-NEXT: v_writelane_b32 v43, s52, 12
+; CHECK-NEXT: v_writelane_b32 v43, s51, 9
+; CHECK-NEXT: v_writelane_b32 v43, s52, 10
+; CHECK-NEXT: v_writelane_b32 v43, s53, 11
+; CHECK-NEXT: v_writelane_b32 v43, s30, 12
; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49]
; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
-; CHECK-NEXT: v_writelane_b32 v43, s53, 13
+; CHECK-NEXT: v_writelane_b32 v43, s31, 13
; CHECK-NEXT: v_mov_b32_e32 v42, v31
; CHECK-NEXT: v_mov_b32_e32 v41, v3
; CHECK-NEXT: v_mov_b32_e32 v40, v2
@@ -313,20 +313,20 @@ define double @test_powr_fast_f64(double %x, double %y) {
; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; CHECK-NEXT: v_readlane_b32 s53, v43, 13
-; CHECK-NEXT: v_readlane_b32 s52, v43, 12
-; CHECK-NEXT: v_readlane_b32 s51, v43, 11
-; CHECK-NEXT: v_readlane_b32 s50, v43, 10
-; CHECK-NEXT: v_readlane_b32 s49, v43, 9
-; CHECK-NEXT: v_readlane_b32 s48, v43, 8
-; CHECK-NEXT: v_readlane_b32 s39, v43, 7
-; CHECK-NEXT: v_readlane_b32 s38, v43, 6
-; CHECK-NEXT: v_readlane_b32 s37, v43, 5
-; CHECK-NEXT: v_readlane_b32 s36, v43, 4
-; CHECK-NEXT: v_readlane_b32 s35, v43, 3
-; CHECK-NEXT: v_readlane_b32 s34, v43, 2
-; CHECK-NEXT: v_readlane_b32 s31, v43, 1
-; CHECK-NEXT: v_readlane_b32 s30, v43, 0
+; CHECK-NEXT: v_readlane_b32 s30, v43, 12
+; CHECK-NEXT: v_readlane_b32 s31, v43, 13
+; CHECK-NEXT: v_readlane_b32 s53, v43, 11
+; CHECK-NEXT: v_readlane_b32 s52, v43, 10
+; CHECK-NEXT: v_readlane_b32 s51, v43, 9
+; CHECK-NEXT: v_readlane_b32 s50, v43, 8
+; CHECK-NEXT: v_readlane_b32 s49, v43, 7
+; CHECK-NEXT: v_readlane_b32 s48, v43, 6
+; CHECK-NEXT: v_readlane_b32 s39, v43, 5
+; CHECK-NEXT: v_readlane_b32 s38, v43, 4
+; CHECK-NEXT: v_readlane_b32 s37, v43, 3
+; CHECK-NEXT: v_readlane_b32 s36, v43, 2
+; CHECK-NEXT: v_readlane_b32 s35, v43, 1
+; CHECK-NEXT: v_readlane_b32 s34, v43, 0
; CHECK-NEXT: s_mov_b32 s32, s33
; CHECK-NEXT: v_readlane_b32 s4, v43, 14
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -400,32 +400,32 @@ define double @test_pown_fast_f64(double %x, i32 %y) {
; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[18:19]
; CHECK-NEXT: v_writelane_b32 v43, s16, 14
-; CHECK-NEXT: v_writelane_b32 v43, s30, 0
-; CHECK-NEXT: v_writelane_b32 v43, s31, 1
-; CHECK-NEXT: v_writelane_b32 v43, s34, 2
-; CHECK-NEXT: v_writelane_b32 v43, s35, 3
-; CHECK-NEXT: v_writelane_b32 v43, s36, 4
-; CHECK-NEXT: v_writelane_b32 v43, s37, 5
-; CHECK-NEXT: v_writelane_b32 v43, s38, 6
-; CHECK-NEXT: v_writelane_b32 v43, s39, 7
+; CHECK-NEXT: v_writelane_b32 v43, s34, 0
+; CHECK-NEXT: v_writelane_b32 v43, s35, 1
+; CHECK-NEXT: v_writelane_b32 v43, s36, 2
+; CHECK-NEXT: v_writelane_b32 v43, s37, 3
+; CHECK-NEXT: v_writelane_b32 v43, s38, 4
+; CHECK-NEXT: v_writelane_b32 v43, s39, 5
; CHECK-NEXT: s_addk_i32 s32, 0x800
-; CHECK-NEXT: v_writelane_b32 v43, s48, 8
-; CHECK-NEXT: v_writelane_b32 v43, s49, 9
+; CHECK-NEXT: v_writelane_b32 v43, s48, 6
+; CHECK-NEXT: v_writelane_b32 v43, s49, 7
; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5]
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12
+; CHECK-NEXT: v_writelane_b32 v43, s50, 8
; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT: v_writelane_b32 v43, s50, 10
+; CHECK-NEXT: v_writelane_b32 v43, s51, 9
+; CHECK-NEXT: v_writelane_b32 v43, s52, 10
; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
-; CHECK-NEXT: v_writelane_b32 v43, s51, 11
+; CHECK-NEXT: v_writelane_b32 v43, s53, 11
; CHECK-NEXT: v_mov_b32_e32 v42, v1
-; CHECK-NEXT: v_writelane_b32 v43, s52, 12
+; CHECK-NEXT: v_writelane_b32 v43, s30, 12
; CHECK-NEXT: v_and_b32_e32 v1, 0x7fffffff, v42
; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49]
-; CHECK-NEXT: v_writelane_b32 v43, s53, 13
+; CHECK-NEXT: v_writelane_b32 v43, s31, 13
; CHECK-NEXT: v_mov_b32_e32 v40, v31
; CHECK-NEXT: v_mov_b32_e32 v41, v2
; CHECK-NEXT: s_mov_b32 s50, s15
@@ -459,21 +459,21 @@ define double @test_pown_fast_f64(double %x, i32 %y) {
; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; CHECK-NEXT: v_readlane_b32 s30, v43, 12
; CHECK-NEXT: v_or_b32_e32 v1, v2, v1
-; CHECK-NEXT: v_readlane_b32 s53, v43, 13
-; CHECK-NEXT: v_readlane_b32 s52, v43, 12
-; CHECK-NEXT: v_readlane_b32 s51, v43, 11
-; CHECK-NEXT: v_readlane_b32 s50, v43, 10
-; CHECK-NEXT: v_readlane_b32 s49, v43, 9
-; CHECK-NEXT: v_readlane_b32 s48, v43, 8
-; CHECK-NEXT: v_readlane_b32 s39, v43, 7
-; CHECK-NEXT: v_readlane_b32 s38, v43, 6
-; CHECK-NEXT: v_readlane_b32 s37, v43, 5
-; CHECK-NEXT: v_readlane_b32 s36, v43, 4
-; CHECK-NEXT: v_readlane_b32 s35, v43, 3
-; CHECK-NEXT: v_readlane_b32 s34, v43, 2
-; CHECK-NEXT: v_readlane_b32 s31, v43, 1
-; CHECK-NEXT: v_readlane_b32 s30, v43, 0
+; CHECK-NEXT: v_readlane_b32 s31, v43, 13
+; CHECK-NEXT: v_readlane_b32 s53, v43, 11
+; CHECK-NEXT: v_readlane_b32 s52, v43, 10
+; CHECK-NEXT: v_readlane_b32 s51, v43, 9
+; CHECK-NEXT: v_readlane_b32 s50, v43, 8
+; CHECK-NEXT: v_readlane_b32 s49, v43, 7
+; CHECK-NEXT: v_readlane_b32 s48, v43, 6
+; CHECK-NEXT: v_readlane_b32 s39, v43, 5
+; CHECK-NEXT: v_readlane_b32 s38, v43, 4
+; CHECK-NEXT: v_readlane_b32 s37, v43, 3
+; CHECK-NEXT: v_readlane_b32 s36, v43, 2
+; CHECK-NEXT: v_readlane_b32 s35, v43, 1
+; CHECK-NEXT: v_readlane_b32 s34, v43, 0
; CHECK-NEXT: s_mov_b32 s32, s33
; CHECK-NEXT: v_readlane_b32 s4, v43, 14
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -542,30 +542,30 @@ define double @test_pown_fast_f64_known_even(double %x, i32 %y.arg) {
; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[18:19]
; CHECK-NEXT: v_writelane_b32 v42, s16, 14
-; CHECK-NEXT: v_writelane_b32 v42, s30, 0
-; CHECK-NEXT: v_writelane_b32 v42, s31, 1
-; CHECK-NEXT: v_writelane_b32 v42, s34, 2
-; CHECK-NEXT: v_writelane_b32 v42, s35, 3
-; CHECK-NEXT: v_writelane_b32 v42, s36, 4
-; CHECK-NEXT: v_writelane_b32 v42, s37, 5
-; CHECK-NEXT: v_writelane_b32 v42, s38, 6
-; CHECK-NEXT: v_writelane_b32 v42, s39, 7
+; CHECK-NEXT: v_writelane_b32 v42, s34, 0
+; CHECK-NEXT: v_writelane_b32 v42, s35, 1
+; CHECK-NEXT: v_writelane_b32 v42, s36, 2
+; CHECK-NEXT: v_writelane_b32 v42, s37, 3
+; CHECK-NEXT: v_writelane_b32 v42, s38, 4
+; CHECK-NEXT: v_writelane_b32 v42, s39, 5
; CHECK-NEXT: s_addk_i32 s32, 0x400
-; CHECK-NEXT: v_writelane_b32 v42, s48, 8
-; CHECK-NEXT: v_writelane_b32 v42, s49, 9
+; CHECK-NEXT: v_writelane_b32 v42, s48, 6
+; CHECK-NEXT: v_writelane_b32 v42, s49, 7
; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5]
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12
+; CHECK-NEXT: v_writelane_b32 v42, s50, 8
; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT: v_writelane_b32 v42, s50, 10
-; CHECK-NEXT: v_writelane_b32 v42, s51, 11
-; CHECK-NEXT: v_writelane_b32 v42, s52, 12
+; CHECK-NEXT: v_writelane_b32 v42, s51, 9
+; CHECK-NEXT: v_writelane_b32 v42, s52, 10
+; CHECK-NEXT: v_writelane_b32 v42, s53, 11
+; CHECK-NEXT: v_writelane_b32 v42, s30, 12
; CHECK-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49]
; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
-; CHECK-NEXT: v_writelane_b32 v42, s53, 13
+; CHECK-NEXT: v_writelane_b32 v42, s31, 13
; CHECK-NEXT: v_mov_b32_e32 v40, v31
; CHECK-NEXT: s_mov_b32 s50, s15
; CHECK-NEXT: s_mov_b32 s51, s14
@@ -596,20 +596,20 @@ define double @test_pown_fast_f64_known_even(double %x, i32 %y.arg) {
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; CHECK-NEXT: v_readlane_b32 s53, v42, 13
-; CHECK-NEXT: v_readlane_b32 s52, v42, 12
-; CHECK-NEXT: v_readlane_b32 s51, v42, 11
-; CHECK-NEXT: v_readlane_b32 s50, v42, 10
-; CHECK-NEXT: v_readlane_b32 s49, v42, 9
-; CHECK-NEXT: v_readlane_b32 s48, v42, 8
-; CHECK-NEXT: v_readlane_b32 s39, v42, 7
-; CHECK-NEXT: v_readlane_b32 s38, v42, 6
-; CHECK-NEXT: v_readlane_b32 s37, v42, 5
-; CHECK-NEXT: v_readlane_b32 s36, v42, 4
-; CHECK-NEXT: v_readlane_b32 s35, v42, 3
-; CHECK-NEXT: v_readlane_b32 s34, v42, 2
-; CHECK-NEXT: v_readlane_b32 s31, v42, 1
-; CHECK-NEXT: v_readlane_b32 s30, v42, 0
+; CHECK-NEXT: v_readlane_b32 s30, v42, 12
+; CHECK-NEXT: v_readlane_b32 s31, v42, 13
+; CHECK-NEXT: v_readlane_b32 s53, v42, 11
+; CHECK-NEXT: v_readlane_b32 s52, v42, 10
+; CHECK-NEXT: v_readlane_b32 s51, v42, 9
+; CHECK-NEXT: v_readlane_b32 s50, v42, 8
+; CHECK-NEXT: v_readlane_b32 s49, v42, 7
+; CHECK-NEXT: v_readlane_b32 s48, v42, 6
+; CHECK-NEXT: v_readlane_b32 s39, v42, 5
+; CHECK-NEXT: v_readlane_b32 s38, v42, 4
+; CHECK-NEXT: v_readlane_b32 s37, v42, 3
+; CHECK-NEXT: v_readlane_b32 s36, v42, 2
+; CHECK-NEXT: v_readlane_b32 s35, v42, 1
+; CHECK-NEXT: v_readlane_b32 s34, v42, 0
; CHECK-NEXT: s_mov_b32 s32, s33
; CHECK-NEXT: v_readlane_b32 s4, v42, 14
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -683,32 +683,32 @@ define double @test_pown_fast_f64_known_odd(double %x, i32 %y.arg) {
; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[18:19]
; CHECK-NEXT: v_writelane_b32 v43, s16, 14
-; CHECK-NEXT: v_writelane_b32 v43, s30, 0
-; CHECK-NEXT: v_writelane_b32 v43, s31, 1
-; CHECK-NEXT: v_writelane_b32 v43, s34, 2
-; CHECK-NEXT: v_writelane_b32 v43, s35, 3
-; CHECK-NEXT: v_writelane_b32 v43, s36, 4
-; CHECK-NEXT: v_writelane_b32 v43, s37, 5
-; CHECK-NEXT: v_writelane_b32 v43, s38, 6
-; CHECK-NEXT: v_writelane_b32 v43, s39, 7
+; CHECK-NEXT: v_writelane_b32 v43, s34, 0
+; CHECK-NEXT: v_writelane_b32 v43, s35, 1
+; CHECK-NEXT: v_writelane_b32 v43, s36, 2
+; CHECK-NEXT: v_writelane_b32 v43, s37, 3
+; CHECK-NEXT: v_writelane_b32 v43, s38, 4
+; CHECK-NEXT: v_writelane_b32 v43, s39, 5
; CHECK-NEXT: s_addk_i32 s32, 0x800
-; CHECK-NEXT: v_writelane_b32 v43, s48, 8
-; CHECK-NEXT: v_writelane_b32 v43, s49, 9
+; CHECK-NEXT: v_writelane_b32 v43, s48, 6
+; CHECK-NEXT: v_writelane_b32 v43, s49, 7
; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5]
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12
+; CHECK-NEXT: v_writelane_b32 v43, s50, 8
; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT: v_writelane_b32 v43, s50, 10
+; CHECK-NEXT: v_writelane_b32 v43, s51, 9
+; CHECK-NEXT: v_writelane_b32 v43, s52, 10
; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
-; CHECK-NEXT: v_writelane_b32 v43, s51, 11
+; CHECK-NEXT: v_writelane_b32 v43, s53, 11
; CHECK-NEXT: v_mov_b32_e32 v41, v1
-; CHECK-NEXT: v_writelane_b32 v43, s52, 12
+; CHECK-NEXT: v_writelane_b32 v43, s30, 12
; CHECK-NEXT: v_and_b32_e32 v1, 0x7fffffff, v41
; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49]
-; CHECK-NEXT: v_writelane_b32 v43, s53, 13
+; CHECK-NEXT: v_writelane_b32 v43, s31, 13
; CHECK-NEXT: v_mov_b32_e32 v40, v31
; CHECK-NEXT: s_mov_b32 s50, s15
; CHECK-NEXT: s_mov_b32 s51, s14
@@ -741,21 +741,21 @@ define double @test_pown_fast_f64_known_odd(double %x, i32 %y.arg) {
; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; CHECK-NEXT: v_readlane_b32 s30, v43, 12
; CHECK-NEXT: v_or_b32_e32 v1, v2, v1
-; CHECK-NEXT: v_readlane_b32 s53, v43, 13
-; CHECK-NEXT: v_readlane_b32 s52, v43, 12
-; CHECK-NEXT: v_readlane_b32 s51, v43, 11
-; CHECK-NEXT: v_readlane_b32 s50, v43, 10
-; CHECK-NEXT: v_readlane_b32 s49, v43, 9
-; CHECK-NEXT: v_readlane_b32 s48, v43, 8
-; CHECK-NEXT: v_readlane_b32 s39, v43, 7
-; CHECK-NEXT: v_readlane_b32 s38, v43, 6
-; CHECK-NEXT: v_readlane_b32 s37, v43, 5
-; CHECK-NEXT: v_readlane_b32 s36, v43, 4
-; CHECK-NEXT: v_readlane_b32 s35, v43, 3
-; CHECK-NEXT: v_readlane_b32 s34, v43, 2
-; CHECK-NEXT: v_readlane_b32 s31, v43, 1
-; CHECK-NEXT: v_readlane_b32 s30, v43, 0
+; CHECK-NEXT: v_readlane_b32 s31, v43, 13
+; CHECK-NEXT: v_readlane_b32 s53, v43, 11
+; CHECK-NEXT: v_readlane_b32 s52, v43, 10
+; CHECK-NEXT: v_readlane_b32 s51, v43, 9
+; CHECK-NEXT: v_readlane_b32 s50, v43, 8
+; CHECK-NEXT: v_readlane_b32 s49, v43, 7
+; CHECK-NEXT: v_readlane_b32 s48, v43, 6
+; CHECK-NEXT: v_readlane_b32 s39, v43, 5
+; CHECK-NEXT: v_readlane_b32 s38, v43, 4
+; CHECK-NEXT: v_readlane_b32 s37, v43, 3
+; CHECK-NEXT: v_readlane_b32 s36, v43, 2
+; CHECK-NEXT: v_readlane_b32 s35, v43, 1
+; CHECK-NEXT: v_readlane_b32 s34, v43, 0
; CHECK-NEXT: s_mov_b32 s32, s33
; CHECK-NEXT: v_readlane_b32 s4, v43, 14
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior2.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior2.ll
index 583b6fe0a81ca..3bed751c979c3 100644
--- a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior2.ll
+++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior2.ll
@@ -214,8 +214,8 @@ define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) #0 {
; GFX8-NEXT: v_writelane_b32 v3, s31, 1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX8-NEXT: v_readlane_b32 s31, v3, 1
; GFX8-NEXT: v_readlane_b32 s30, v3, 0
+; GFX8-NEXT: v_readlane_b32 s31, v3, 1
; GFX8-NEXT: s_mov_b32 s32, s33
; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX8-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -242,8 +242,8 @@ define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) #0 {
; GFX8-ARCH-FLAT-NEXT: v_writelane_b32 v3, s31, 1
; GFX8-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-ARCH-FLAT-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX8-ARCH-FLAT-NEXT: v_readlane_b32 s31, v3, 1
; GFX8-ARCH-FLAT-NEXT: v_readlane_b32 s30, v3, 0
+; GFX8-ARCH-FLAT-NEXT: v_readlane_b32 s31, v3, 1
; GFX8-ARCH-FLAT-NEXT: s_mov_b32 s32, s33
; GFX8-ARCH-FLAT-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX8-ARCH-FLAT-NEXT: s_add_i32 s3, s33, 8
@@ -270,8 +270,8 @@ define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) #0 {
; GFX9-NEXT: v_writelane_b32 v3, s31, 1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT: v_readlane_b32 s31, v3, 1
; GFX9-NEXT: v_readlane_b32 s30, v3, 0
+; GFX9-NEXT: v_readlane_b32 s31, v3, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -297,8 +297,8 @@ define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) #0 {
; GFX9-ARCH-FLAT-NEXT: v_writelane_b32 v3, s31, 1
; GFX9-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-ARCH-FLAT-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX9-ARCH-FLAT-NEXT: v_readlane_b32 s31, v3, 1
; GFX9-ARCH-FLAT-NEXT: v_readlane_b32 s30, v3, 0
+; GFX9-ARCH-FLAT-NEXT: v_readlane_b32 s31, v3, 1
; GFX9-ARCH-FLAT-NEXT: s_mov_b32 s32, s33
; GFX9-ARCH-FLAT-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX9-ARCH-FLAT-NEXT: scratch_load_dword v3, off, s33 ; 4-byte Folded Reload
@@ -321,11 +321,12 @@ define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) #0 {
; GFX942-ARCH-FLAT-NEXT: s_addc_u32 s1, s1, with_private_to_flat_addrspacecast@gotpcrel32@hi+12
; GFX942-ARCH-FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX942-ARCH-FLAT-NEXT: v_writelane_b32 v3, s30, 0
+; GFX942-ARCH-FLAT-NEXT: s_nop 1
; GFX942-ARCH-FLAT-NEXT: v_writelane_b32 v3, s31, 1
; GFX942-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-ARCH-FLAT-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX942-ARCH-FLAT-NEXT: v_readlane_b32 s31, v3, 1
; GFX942-ARCH-FLAT-NEXT: v_readlane_b32 s30, v3, 0
+; GFX942-ARCH-FLAT-NEXT: v_readlane_b32 s31, v3, 1
; GFX942-ARCH-FLAT-NEXT: s_mov_b32 s32, s33
; GFX942-ARCH-FLAT-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX942-ARCH-FLAT-NEXT: scratch_load_dword v3, off, s33 ; 4-byte Folded Reload
@@ -352,8 +353,8 @@ define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) #0 {
; GFX10-NEXT: v_writelane_b32 v3, s31, 1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX10-NEXT: v_readlane_b32 s31, v3, 1
; GFX10-NEXT: v_readlane_b32 s30, v3, 0
+; GFX10-NEXT: v_readlane_b32 s31, v3, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index ab9cd8e037734..7d1e47ab8acb0 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -4395,8 +4395,8 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_readlane_b32 s31, v2, 1
; GCN-NEXT: v_readlane_b32 s30, v2, 0
+; GCN-NEXT: v_readlane_b32 s31, v2, 1
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -4424,10 +4424,10 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_readlane_b32 s30, v2, 0
; GFX7-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_readlane_b32 s31, v2, 1
-; GFX7-NEXT: v_readlane_b32 s30, v2, 0
; GFX7-NEXT: s_mov_b32 s32, s33
; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -4453,10 +4453,10 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
; GFX8-NEXT: v_writelane_b32 v2, s31, 1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX8-NEXT: v_readlane_b32 s30, v2, 0
; GFX8-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_readlane_b32 s31, v2, 1
-; GFX8-NEXT: v_readlane_b32 s30, v2, 0
; GFX8-NEXT: s_mov_b32 s32, s33
; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -4482,10 +4482,10 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
; GFX900-NEXT: v_writelane_b32 v2, s31, 1
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX900-NEXT: v_readlane_b32 s30, v2, 0
; GFX900-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_readlane_b32 s31, v2, 1
-; GFX900-NEXT: v_readlane_b32 s30, v2, 0
; GFX900-NEXT: s_mov_b32 s32, s33
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -4508,13 +4508,14 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
; GFX950-NEXT: s_addc_u32 s1, s1, test_arg_store@gotpcrel32@hi+12
; GFX950-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX950-NEXT: v_writelane_b32 v4, s30, 0
+; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_writelane_b32 v4, s31, 1
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX950-NEXT: v_readlane_b32 s30, v4, 0
; GFX950-NEXT: scratch_store_short v1, v0, off sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_readlane_b32 s31, v4, 1
-; GFX950-NEXT: v_readlane_b32 s30, v4, 0
; GFX950-NEXT: s_mov_b32 s32, s33
; GFX950-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX950-NEXT: scratch_load_dword v4, off, s33 ; 4-byte Folded Reload
@@ -4541,10 +4542,10 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
; GFX10-NEXT: v_writelane_b32 v2, s31, 1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX10-NEXT: v_readlane_b32 s30, v2, 0
; GFX10-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_readlane_b32 s31, v2, 1
-; GFX10-NEXT: v_readlane_b32 s30, v2, 0
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -4571,10 +4572,11 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
; GFX11-NEXT: v_writelane_b32 v2, s31, 1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_readlane_b32 s30, v2, 0
; GFX11-NEXT: scratch_store_b16 v1, v0, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_readlane_b32 s31, v2, 1
-; GFX11-NEXT: v_readlane_b32 s30, v2, 0
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload
@@ -4601,10 +4603,11 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
; GFX1250-NEXT: v_writelane_b32 v4, s31, 1
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_swap_pc_i64 s[30:31], s[0:1]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_readlane_b32 s30, v4, 0
; GFX1250-NEXT: scratch_store_b16 v1, v0, off scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: v_readlane_b32 s31, v4, 1
-; GFX1250-NEXT: v_readlane_b32 s30, v4, 0
; GFX1250-NEXT: s_mov_b32 s32, s33
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_xor_saveexec_b32 s0, -1
@@ -4648,8 +4651,8 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_short v0, v2, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_readlane_b32 s31, v4, 1
; GCN-NEXT: v_readlane_b32 s30, v4, 0
+; GCN-NEXT: v_readlane_b32 s31, v4, 1
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -4679,13 +4682,13 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_add_i32_e32 v3, vcc, 2, v2
+; GFX7-NEXT: v_readlane_b32 s30, v4, 0
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: buffer_store_short v1, v3, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_store_short v0, v2, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_readlane_b32 s31, v4, 1
-; GFX7-NEXT: v_readlane_b32 s30, v4, 0
; GFX7-NEXT: s_mov_b32 s32, s33
; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -4711,10 +4714,10 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
; GFX8-NEXT: v_writelane_b32 v2, s31, 1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX8-NEXT: v_readlane_b32 s30, v2, 0
; GFX8-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_readlane_b32 s31, v2, 1
-; GFX8-NEXT: v_readlane_b32 s30, v2, 0
; GFX8-NEXT: s_mov_b32 s32, s33
; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -4740,10 +4743,10 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
; GFX900-NEXT: v_writelane_b32 v2, s31, 1
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX900-NEXT: v_readlane_b32 s30, v2, 0
; GFX900-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_readlane_b32 s31, v2, 1
-; GFX900-NEXT: v_readlane_b32 s30, v2, 0
; GFX900-NEXT: s_mov_b32 s32, s33
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -4766,13 +4769,14 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
; GFX950-NEXT: s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12
; GFX950-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX950-NEXT: v_writelane_b32 v4, s30, 0
+; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_writelane_b32 v4, s31, 1
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX950-NEXT: v_readlane_b32 s30, v4, 0
; GFX950-NEXT: scratch_store_dword v1, v0, off sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_readlane_b32 s31, v4, 1
-; GFX950-NEXT: v_readlane_b32 s30, v4, 0
; GFX950-NEXT: s_mov_b32 s32, s33
; GFX950-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX950-NEXT: scratch_load_dword v4, off, s33 ; 4-byte Folded Reload
@@ -4799,10 +4803,10 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
; GFX10-NEXT: v_writelane_b32 v2, s31, 1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX10-NEXT: v_readlane_b32 s30, v2, 0
; GFX10-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_readlane_b32 s31, v2, 1
-; GFX10-NEXT: v_readlane_b32 s30, v2, 0
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -4829,10 +4833,11 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
; GFX11-NEXT: v_writelane_b32 v2, s31, 1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_readlane_b32 s30, v2, 0
; GFX11-NEXT: scratch_store_b32 v1, v0, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_readlane_b32 s31, v2, 1
-; GFX11-NEXT: v_readlane_b32 s30, v2, 0
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload
@@ -4859,10 +4864,11 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
; GFX1250-NEXT: v_writelane_b32 v4, s31, 1
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_swap_pc_i64 s[30:31], s[0:1]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_readlane_b32 s30, v4, 0
; GFX1250-NEXT: scratch_store_b32 v1, v0, off scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: v_readlane_b32 s31, v4, 1
-; GFX1250-NEXT: v_readlane_b32 s30, v4, 0
; GFX1250-NEXT: s_mov_b32 s32, s33
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_xor_saveexec_b32 s0, -1
@@ -4908,8 +4914,8 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, v3, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_readlane_b32 s31, v5, 1
; GCN-NEXT: v_readlane_b32 s30, v5, 0
+; GCN-NEXT: v_readlane_b32 s31, v5, 1
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -4942,12 +4948,12 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 4, v3
+; GFX7-NEXT: v_readlane_b32 s30, v4, 0
; GFX7-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_store_dword v0, v3, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_readlane_b32 s31, v4, 1
-; GFX7-NEXT: v_readlane_b32 s30, v4, 0
; GFX7-NEXT: s_mov_b32 s32, s33
; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -4974,12 +4980,12 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v2
+; GFX8-NEXT: v_readlane_b32 s30, v4, 0
; GFX8-NEXT: buffer_store_short v1, v3, s[0:3], 0 offen
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_readlane_b32 s31, v4, 1
-; GFX8-NEXT: v_readlane_b32 s30, v4, 0
; GFX8-NEXT: s_mov_b32 s32, s33
; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX8-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -5005,12 +5011,12 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
; GFX900-NEXT: v_writelane_b32 v3, s31, 1
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX900-NEXT: v_readlane_b32 s30, v3, 0
; GFX900-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen offset:4
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_readlane_b32 s31, v3, 1
-; GFX900-NEXT: v_readlane_b32 s30, v3, 0
; GFX900-NEXT: s_mov_b32 s32, s33
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -5033,16 +5039,17 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
; GFX950-NEXT: s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12
; GFX950-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX950-NEXT: v_writelane_b32 v5, s30, 0
-; GFX950-NEXT: v_writelane_b32 v5, s31, 1
; GFX950-NEXT: v_mov_b32_e32 v4, v2
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_writelane_b32 v5, s31, 1
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX950-NEXT: v_readlane_b32 s30, v5, 0
; GFX950-NEXT: scratch_store_short v4, v1, off offset:4 sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: scratch_store_dword v4, v0, off sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_readlane_b32 s31, v5, 1
-; GFX950-NEXT: v_readlane_b32 s30, v5, 0
; GFX950-NEXT: s_mov_b32 s32, s33
; GFX950-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX950-NEXT: scratch_load_dword v5, off, s33 ; 4-byte Folded Reload
@@ -5069,12 +5076,12 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
; GFX10-NEXT: v_writelane_b32 v3, s31, 1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX10-NEXT: v_readlane_b32 s30, v3, 0
; GFX10-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen offset:4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_readlane_b32 s31, v3, 1
-; GFX10-NEXT: v_readlane_b32 s30, v3, 0
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -5101,12 +5108,13 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
; GFX11-NEXT: v_writelane_b32 v3, s31, 1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_readlane_b32 s30, v3, 0
; GFX11-NEXT: scratch_store_b16 v2, v1, off offset:4 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: scratch_store_b32 v2, v0, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_readlane_b32 s31, v3, 1
-; GFX11-NEXT: v_readlane_b32 s30, v3, 0
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_load_b32 v3, off, s33 ; 4-byte Folded Reload
@@ -5134,12 +5142,13 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
; GFX1250-NEXT: v_writelane_b32 v5, s31, 1
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_swap_pc_i64 s[30:31], s[0:1]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_readlane_b32 s30, v5, 0
; GFX1250-NEXT: scratch_store_b16 v4, v1, off offset:4 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: scratch_store_b32 v4, v0, off scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: v_readlane_b32 s31, v5, 1
-; GFX1250-NEXT: v_readlane_b32 s30, v5, 0
; GFX1250-NEXT: s_mov_b32 s32, s33
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_xor_saveexec_b32 s0, -1
@@ -5193,8 +5202,8 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_short v0, v4, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_readlane_b32 s31, v8, 1
; GCN-NEXT: v_readlane_b32 s30, v8, 0
+; GCN-NEXT: v_readlane_b32 s31, v8, 1
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -5234,13 +5243,13 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
; GFX7-NEXT: buffer_store_short v2, v3, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 2, v4
+; GFX7-NEXT: v_readlane_b32 s30, v6, 0
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_store_short v0, v4, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_readlane_b32 s31, v6, 1
-; GFX7-NEXT: v_readlane_b32 s30, v6, 0
; GFX7-NEXT: s_mov_b32 s32, s33
; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -5267,12 +5276,12 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v2
+; GFX8-NEXT: v_readlane_b32 s30, v4, 0
; GFX8-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_readlane_b32 s31, v4, 1
-; GFX8-NEXT: v_readlane_b32 s30, v4, 0
; GFX8-NEXT: s_mov_b32 s32, s33
; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX8-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -5298,12 +5307,12 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
; GFX900-NEXT: v_writelane_b32 v3, s31, 1
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX900-NEXT: v_readlane_b32 s30, v3, 0
; GFX900-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_readlane_b32 s31, v3, 1
-; GFX900-NEXT: v_readlane_b32 s30, v3, 0
; GFX900-NEXT: s_mov_b32 s32, s33
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -5326,14 +5335,15 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
; GFX950-NEXT: s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12
; GFX950-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX950-NEXT: v_writelane_b32 v5, s30, 0
-; GFX950-NEXT: v_writelane_b32 v5, s31, 1
; GFX950-NEXT: v_mov_b32_e32 v4, v2
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_writelane_b32 v5, s31, 1
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX950-NEXT: v_readlane_b32 s30, v5, 0
; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_readlane_b32 s31, v5, 1
-; GFX950-NEXT: v_readlane_b32 s30, v5, 0
; GFX950-NEXT: s_mov_b32 s32, s33
; GFX950-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX950-NEXT: scratch_load_dword v5, off, s33 ; 4-byte Folded Reload
@@ -5360,12 +5370,12 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
; GFX10-NEXT: v_writelane_b32 v3, s31, 1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX10-NEXT: v_readlane_b32 s30, v3, 0
; GFX10-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_readlane_b32 s31, v3, 1
-; GFX10-NEXT: v_readlane_b32 s30, v3, 0
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -5392,10 +5402,11 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
; GFX11-NEXT: v_writelane_b32 v3, s31, 1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_readlane_b32 s30, v3, 0
; GFX11-NEXT: scratch_store_b64 v2, v[0:1], off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_readlane_b32 s31, v3, 1
-; GFX11-NEXT: v_readlane_b32 s30, v3, 0
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_load_b32 v3, off, s33 ; 4-byte Folded Reload
@@ -5423,10 +5434,11 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
; GFX1250-NEXT: v_writelane_b32 v5, s31, 1
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_swap_pc_i64 s[30:31], s[0:1]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_readlane_b32 s30, v5, 0
; GFX1250-NEXT: scratch_store_b64 v4, v[0:1], off scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: v_readlane_b32 s31, v5, 1
-; GFX1250-NEXT: v_readlane_b32 s30, v5, 0
; GFX1250-NEXT: s_mov_b32 s32, s33
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_xor_saveexec_b32 s0, -1
@@ -5500,8 +5512,8 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_short v0, v8, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_readlane_b32 s31, v16, 1
; GCN-NEXT: v_readlane_b32 s30, v16, 0
+; GCN-NEXT: v_readlane_b32 s31, v16, 1
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -5561,13 +5573,13 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
; GFX7-NEXT: buffer_store_short v2, v3, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 2, v8
+; GFX7-NEXT: v_readlane_b32 s30, v10, 0
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_store_short v0, v8, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_readlane_b32 s31, v10, 1
-; GFX7-NEXT: v_readlane_b32 s30, v10, 0
; GFX7-NEXT: s_mov_b32 s32, s33
; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX7-NEXT: buffer_load_dword v10, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -5600,12 +5612,12 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
; GFX8-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v4
+; GFX8-NEXT: v_readlane_b32 s30, v6, 0
; GFX8-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_readlane_b32 s31, v6, 1
-; GFX8-NEXT: v_readlane_b32 s30, v6, 0
; GFX8-NEXT: s_mov_b32 s32, s33
; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX8-NEXT: buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -5631,6 +5643,7 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
; GFX900-NEXT: v_writelane_b32 v5, s31, 1
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX900-NEXT: v_readlane_b32 s30, v5, 0
; GFX900-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:12
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen offset:8
@@ -5640,7 +5653,6 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
; GFX900-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_readlane_b32 s31, v5, 1
-; GFX900-NEXT: v_readlane_b32 s30, v5, 0
; GFX900-NEXT: s_mov_b32 s32, s33
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -5663,13 +5675,14 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
; GFX950-NEXT: s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12
; GFX950-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX950-NEXT: v_writelane_b32 v5, s30, 0
+; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_writelane_b32 v5, s31, 1
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX950-NEXT: v_readlane_b32 s30, v5, 0
; GFX950-NEXT: scratch_store_dwordx4 v4, v[0:3], off sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_readlane_b32 s31, v5, 1
-; GFX950-NEXT: v_readlane_b32 s30, v5, 0
; GFX950-NEXT: s_mov_b32 s32, s33
; GFX950-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX950-NEXT: scratch_load_dword v5, off, s33 ; 4-byte Folded Reload
@@ -5696,6 +5709,7 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
; GFX10-NEXT: v_writelane_b32 v5, s31, 1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX10-NEXT: v_readlane_b32 s30, v5, 0
; GFX10-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:12
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen offset:8
@@ -5705,7 +5719,6 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
; GFX10-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_readlane_b32 s31, v5, 1
-; GFX10-NEXT: v_readlane_b32 s30, v5, 0
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
; GFX10-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -5732,10 +5745,11 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
; GFX11-NEXT: v_writelane_b32 v5, s31, 1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_readlane_b32 s30, v5, 0
; GFX11-NEXT: scratch_store_b128 v4, v[0:3], off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_readlane_b32 s31, v5, 1
-; GFX11-NEXT: v_readlane_b32 s30, v5, 0
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_load_b32 v5, off, s33 ; 4-byte Folded Reload
@@ -5762,10 +5776,11 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
; GFX1250-NEXT: v_writelane_b32 v5, s31, 1
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_swap_pc_i64 s[30:31], s[0:1]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_readlane_b32 s30, v5, 0
; GFX1250-NEXT: scratch_store_b128 v4, v[0:3], off scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: v_readlane_b32 s31, v5, 1
-; GFX1250-NEXT: v_readlane_b32 s30, v5, 0
; GFX1250-NEXT: s_mov_b32 s32, s33
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_xor_saveexec_b32 s0, -1
@@ -5879,8 +5894,8 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_short v0, v16, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_readlane_b32 s31, v20, 1
; GCN-NEXT: v_readlane_b32 s30, v20, 0
+; GCN-NEXT: v_readlane_b32 s31, v20, 1
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -5980,13 +5995,13 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
; GFX7-NEXT: buffer_store_short v2, v3, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 2, v16
+; GFX7-NEXT: v_readlane_b32 s30, v18, 0
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_store_short v0, v16, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_readlane_b32 s31, v18, 1
-; GFX7-NEXT: v_readlane_b32 s30, v18, 0
; GFX7-NEXT: s_mov_b32 s32, s33
; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -6031,12 +6046,12 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
; GFX8-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v8
+; GFX8-NEXT: v_readlane_b32 s30, v10, 0
; GFX8-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, v8, s[0:3], 0 offen
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_readlane_b32 s31, v10, 1
-; GFX8-NEXT: v_readlane_b32 s30, v10, 0
; GFX8-NEXT: s_mov_b32 s32, s33
; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX8-NEXT: buffer_load_dword v10, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -6062,6 +6077,7 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
; GFX900-NEXT: v_writelane_b32 v9, s31, 1
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX900-NEXT: v_readlane_b32 s30, v9, 0
; GFX900-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen offset:28
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: buffer_store_dword v6, v8, s[0:3], 0 offen offset:24
@@ -6079,7 +6095,6 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
; GFX900-NEXT: buffer_store_dword v0, v8, s[0:3], 0 offen
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_readlane_b32 s31, v9, 1
-; GFX900-NEXT: v_readlane_b32 s30, v9, 0
; GFX900-NEXT: s_mov_b32 s32, s33
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -6102,15 +6117,16 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
; GFX950-NEXT: s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12
; GFX950-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX950-NEXT: v_writelane_b32 v9, s30, 0
+; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_writelane_b32 v9, s31, 1
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX950-NEXT: v_readlane_b32 s30, v9, 0
; GFX950-NEXT: scratch_store_dwordx4 v8, v[4:7], off offset:16 sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: scratch_store_dwordx4 v8, v[0:3], off sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_readlane_b32 s31, v9, 1
-; GFX950-NEXT: v_readlane_b32 s30, v9, 0
; GFX950-NEXT: s_mov_b32 s32, s33
; GFX950-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX950-NEXT: scratch_load_dword v9, off, s33 ; 4-byte Folded Reload
@@ -6137,6 +6153,7 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
; GFX10-NEXT: v_writelane_b32 v9, s31, 1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX10-NEXT: v_readlane_b32 s30, v9, 0
; GFX10-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen offset:28
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_store_dword v6, v8, s[0:3], 0 offen offset:24
@@ -6154,7 +6171,6 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
; GFX10-NEXT: buffer_store_dword v0, v8, s[0:3], 0 offen
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_readlane_b32 s31, v9, 1
-; GFX10-NEXT: v_readlane_b32 s30, v9, 0
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
; GFX10-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -6181,12 +6197,13 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
; GFX11-NEXT: v_writelane_b32 v9, s31, 1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_readlane_b32 s30, v9, 0
; GFX11-NEXT: scratch_store_b128 v8, v[4:7], off offset:16 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: scratch_store_b128 v8, v[0:3], off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_readlane_b32 s31, v9, 1
-; GFX11-NEXT: v_readlane_b32 s30, v9, 0
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_load_b32 v9, off, s33 ; 4-byte Folded Reload
@@ -6213,12 +6230,13 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
; GFX1250-NEXT: v_writelane_b32 v9, s31, 1
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_swap_pc_i64 s[30:31], s[0:1]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_readlane_b32 s30, v9, 0
; GFX1250-NEXT: scratch_store_b128 v8, v[4:7], off offset:16 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: scratch_store_b128 v8, v[0:3], off scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: v_readlane_b32 s31, v9, 1
-; GFX1250-NEXT: v_readlane_b32 s30, v9, 0
; GFX1250-NEXT: s_mov_b32 s32, s33
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_xor_saveexec_b32 s0, -1
@@ -48879,34 +48897,34 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX8-NEXT: v_and_b32_e32 v0, 1, v20
; GFX8-NEXT: v_cmp_eq_u32_e64 s[60:61], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v21
+; GFX8-NEXT: v_writelane_b32 v34, s34, 0
; GFX8-NEXT: v_cmp_eq_u32_e64 s[62:63], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v22
+; GFX8-NEXT: v_writelane_b32 v34, s35, 1
; GFX8-NEXT: v_cmp_eq_u32_e64 s[72:73], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v23
+; GFX8-NEXT: v_writelane_b32 v34, s36, 2
; GFX8-NEXT: v_cmp_eq_u32_e64 s[74:75], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v24
+; GFX8-NEXT: v_writelane_b32 v34, s37, 3
; GFX8-NEXT: v_cmp_eq_u32_e64 s[76:77], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v25
-; GFX8-NEXT: v_writelane_b32 v34, s30, 0
+; GFX8-NEXT: v_writelane_b32 v34, s38, 4
; GFX8-NEXT: v_cmp_eq_u32_e64 s[78:79], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v26
-; GFX8-NEXT: v_writelane_b32 v34, s31, 1
+; GFX8-NEXT: v_writelane_b32 v34, s39, 5
; GFX8-NEXT: v_cmp_eq_u32_e64 s[88:89], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v27
-; GFX8-NEXT: v_writelane_b32 v34, s34, 2
+; GFX8-NEXT: v_writelane_b32 v34, s30, 6
; GFX8-NEXT: v_cmp_eq_u32_e64 s[90:91], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v28
-; GFX8-NEXT: v_writelane_b32 v34, s35, 3
+; GFX8-NEXT: v_writelane_b32 v34, s31, 7
; GFX8-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v29
-; GFX8-NEXT: v_writelane_b32 v34, s36, 4
; GFX8-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v30
-; GFX8-NEXT: v_writelane_b32 v34, s37, 5
; GFX8-NEXT: v_cmp_eq_u32_e64 s[36:37], 1, v0
; GFX8-NEXT: buffer_load_ushort v0, off, s[0:3], s32
-; GFX8-NEXT: v_writelane_b32 v34, s38, 6
-; GFX8-NEXT: v_writelane_b32 v34, s39, 7
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
; GFX8-NEXT: v_cmp_eq_u32_e64 s[38:39], 1, v0
@@ -49032,6 +49050,7 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v28
; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v26
; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v24
+; GFX8-NEXT: v_readlane_b32 s30, v34, 6
; GFX8-NEXT: v_or_b32_sdwa v8, v16, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: v_or_b32_sdwa v9, v18, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: v_or_b32_sdwa v10, v20, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -49040,14 +49059,13 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX8-NEXT: v_or_b32_sdwa v13, v29, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: v_or_b32_sdwa v14, v27, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: v_or_b32_sdwa v15, v25, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT: v_readlane_b32 s39, v34, 7
-; GFX8-NEXT: v_readlane_b32 s38, v34, 6
-; GFX8-NEXT: v_readlane_b32 s37, v34, 5
-; GFX8-NEXT: v_readlane_b32 s36, v34, 4
-; GFX8-NEXT: v_readlane_b32 s35, v34, 3
-; GFX8-NEXT: v_readlane_b32 s34, v34, 2
-; GFX8-NEXT: v_readlane_b32 s31, v34, 1
-; GFX8-NEXT: v_readlane_b32 s30, v34, 0
+; GFX8-NEXT: v_readlane_b32 s31, v34, 7
+; GFX8-NEXT: v_readlane_b32 s39, v34, 5
+; GFX8-NEXT: v_readlane_b32 s38, v34, 4
+; GFX8-NEXT: v_readlane_b32 s37, v34, 3
+; GFX8-NEXT: v_readlane_b32 s36, v34, 2
+; GFX8-NEXT: v_readlane_b32 s35, v34, 1
+; GFX8-NEXT: v_readlane_b32 s34, v34, 0
; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX8-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
; GFX8-NEXT: s_mov_b64 exec, s[4:5]
@@ -49119,11 +49137,11 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX900-NEXT: v_and_b32_e32 v0, 1, v28
; GFX900-NEXT: v_cmp_eq_u32_e64 s[94:95], 1, v0
; GFX900-NEXT: buffer_load_ushort v0, off, s[0:3], s32
-; GFX900-NEXT: v_writelane_b32 v33, s30, 0
-; GFX900-NEXT: v_writelane_b32 v33, s31, 1
-; GFX900-NEXT: v_writelane_b32 v33, s34, 2
+; GFX900-NEXT: v_writelane_b32 v33, s34, 0
+; GFX900-NEXT: v_writelane_b32 v33, s35, 1
+; GFX900-NEXT: v_writelane_b32 v33, s30, 2
+; GFX900-NEXT: v_writelane_b32 v33, s31, 3
; GFX900-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX900-NEXT: v_writelane_b32 v33, s35, 3
; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_and_b32_e32 v0, 1, v0
@@ -49228,6 +49246,7 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; GFX900-NEXT: s_mov_b32 s4, 0x5040100
+; GFX900-NEXT: v_readlane_b32 s30, v33, 2
; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4
; GFX900-NEXT: v_perm_b32 v1, v2, v5, s4
; GFX900-NEXT: v_perm_b32 v2, v4, v7, s4
@@ -49244,10 +49263,9 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX900-NEXT: v_perm_b32 v13, v26, v29, s4
; GFX900-NEXT: v_perm_b32 v14, v28, v32, s4
; GFX900-NEXT: v_perm_b32 v15, v31, v30, s4
-; GFX900-NEXT: v_readlane_b32 s35, v33, 3
-; GFX900-NEXT: v_readlane_b32 s34, v33, 2
-; GFX900-NEXT: v_readlane_b32 s31, v33, 1
-; GFX900-NEXT: v_readlane_b32 s30, v33, 0
+; GFX900-NEXT: v_readlane_b32 s31, v33, 3
+; GFX900-NEXT: v_readlane_b32 s35, v33, 1
+; GFX900-NEXT: v_readlane_b32 s34, v33, 0
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll b/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll
index ab2ad19d0f1bf..fb11d3b7d9d65 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll
@@ -902,47 +902,47 @@ define void @spill_func(ptr addrspace(1) %arg) #0 {
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[4:5]
; CHECK-NEXT: s_waitcnt expcnt(0)
-; CHECK-NEXT: v_writelane_b32 v0, s30, 0
-; CHECK-NEXT: v_writelane_b32 v0, s31, 1
-; CHECK-NEXT: v_writelane_b32 v0, s33, 2
-; CHECK-NEXT: v_writelane_b32 v0, s34, 3
-; CHECK-NEXT: v_writelane_b32 v0, s35, 4
-; CHECK-NEXT: v_writelane_b32 v0, s36, 5
-; CHECK-NEXT: v_writelane_b32 v0, s37, 6
-; CHECK-NEXT: v_writelane_b32 v0, s38, 7
-; CHECK-NEXT: v_writelane_b32 v0, s39, 8
-; CHECK-NEXT: v_writelane_b32 v0, s48, 9
-; CHECK-NEXT: v_writelane_b32 v0, s49, 10
-; CHECK-NEXT: v_writelane_b32 v0, s50, 11
-; CHECK-NEXT: v_writelane_b32 v0, s51, 12
-; CHECK-NEXT: v_writelane_b32 v0, s52, 13
-; CHECK-NEXT: v_writelane_b32 v0, s53, 14
-; CHECK-NEXT: v_writelane_b32 v0, s54, 15
-; CHECK-NEXT: v_writelane_b32 v0, s55, 16
-; CHECK-NEXT: v_writelane_b32 v0, s64, 17
-; CHECK-NEXT: v_writelane_b32 v0, s65, 18
-; CHECK-NEXT: v_writelane_b32 v0, s66, 19
-; CHECK-NEXT: v_writelane_b32 v0, s67, 20
-; CHECK-NEXT: v_writelane_b32 v0, s68, 21
-; CHECK-NEXT: v_writelane_b32 v0, s69, 22
-; CHECK-NEXT: v_writelane_b32 v0, s70, 23
-; CHECK-NEXT: v_writelane_b32 v0, s71, 24
-; CHECK-NEXT: v_writelane_b32 v0, s80, 25
-; CHECK-NEXT: v_writelane_b32 v0, s81, 26
-; CHECK-NEXT: v_writelane_b32 v0, s82, 27
-; CHECK-NEXT: v_writelane_b32 v0, s83, 28
-; CHECK-NEXT: v_writelane_b32 v0, s84, 29
-; CHECK-NEXT: v_writelane_b32 v0, s85, 30
-; CHECK-NEXT: v_writelane_b32 v0, s86, 31
-; CHECK-NEXT: v_writelane_b32 v0, s87, 32
-; CHECK-NEXT: v_writelane_b32 v0, s96, 33
-; CHECK-NEXT: v_writelane_b32 v0, s97, 34
-; CHECK-NEXT: v_writelane_b32 v0, s98, 35
-; CHECK-NEXT: v_writelane_b32 v0, s99, 36
+; CHECK-NEXT: v_writelane_b32 v0, s33, 0
+; CHECK-NEXT: v_writelane_b32 v0, s34, 1
+; CHECK-NEXT: v_writelane_b32 v0, s35, 2
+; CHECK-NEXT: v_writelane_b32 v0, s36, 3
+; CHECK-NEXT: v_writelane_b32 v0, s37, 4
+; CHECK-NEXT: v_writelane_b32 v0, s38, 5
+; CHECK-NEXT: v_writelane_b32 v0, s39, 6
+; CHECK-NEXT: v_writelane_b32 v0, s48, 7
+; CHECK-NEXT: v_writelane_b32 v0, s49, 8
+; CHECK-NEXT: v_writelane_b32 v0, s50, 9
+; CHECK-NEXT: v_writelane_b32 v0, s51, 10
+; CHECK-NEXT: v_writelane_b32 v0, s52, 11
+; CHECK-NEXT: v_writelane_b32 v0, s53, 12
+; CHECK-NEXT: v_writelane_b32 v0, s54, 13
+; CHECK-NEXT: v_writelane_b32 v0, s55, 14
+; CHECK-NEXT: v_writelane_b32 v0, s64, 15
+; CHECK-NEXT: v_writelane_b32 v0, s65, 16
+; CHECK-NEXT: v_writelane_b32 v0, s66, 17
+; CHECK-NEXT: v_writelane_b32 v0, s67, 18
+; CHECK-NEXT: v_writelane_b32 v0, s68, 19
+; CHECK-NEXT: v_writelane_b32 v0, s69, 20
+; CHECK-NEXT: v_writelane_b32 v0, s70, 21
+; CHECK-NEXT: v_writelane_b32 v0, s71, 22
+; CHECK-NEXT: v_writelane_b32 v0, s80, 23
+; CHECK-NEXT: v_writelane_b32 v0, s81, 24
+; CHECK-NEXT: v_writelane_b32 v0, s82, 25
+; CHECK-NEXT: v_writelane_b32 v0, s83, 26
+; CHECK-NEXT: v_writelane_b32 v0, s84, 27
+; CHECK-NEXT: v_writelane_b32 v0, s85, 28
+; CHECK-NEXT: v_writelane_b32 v0, s86, 29
+; CHECK-NEXT: v_writelane_b32 v0, s87, 30
+; CHECK-NEXT: v_writelane_b32 v0, s96, 31
+; CHECK-NEXT: v_writelane_b32 v0, s97, 32
+; CHECK-NEXT: v_writelane_b32 v0, s98, 33
+; CHECK-NEXT: v_writelane_b32 v0, s99, 34
+; CHECK-NEXT: v_writelane_b32 v0, s100, 35
+; CHECK-NEXT: v_writelane_b32 v0, s101, 36
; CHECK-NEXT: s_mov_b32 s40, s12
-; CHECK-NEXT: v_writelane_b32 v0, s100, 37
+; CHECK-NEXT: v_writelane_b32 v0, s30, 37
; CHECK-NEXT: s_cmp_eq_u32 s40, 0
-; CHECK-NEXT: v_writelane_b32 v0, s101, 38
+; CHECK-NEXT: v_writelane_b32 v0, s31, 38
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: s_mov_b32 s0, 0
; CHECK-NEXT: ;;#ASMEND
@@ -1380,6 +1380,7 @@ define void @spill_func(ptr addrspace(1) %arg) #0 {
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; reg use s31
; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_readlane_b32 s30, v0, 37
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; reg use s32
; CHECK-NEXT: ;;#ASMEND
@@ -1596,45 +1597,44 @@ define void @spill_func(ptr addrspace(1) %arg) #0 {
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; reg use vcc_hi
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_readlane_b32 s101, v0, 38
-; CHECK-NEXT: v_readlane_b32 s100, v0, 37
-; CHECK-NEXT: v_readlane_b32 s99, v0, 36
-; CHECK-NEXT: v_readlane_b32 s98, v0, 35
-; CHECK-NEXT: v_readlane_b32 s97, v0, 34
-; CHECK-NEXT: v_readlane_b32 s96, v0, 33
-; CHECK-NEXT: v_readlane_b32 s87, v0, 32
-; CHECK-NEXT: v_readlane_b32 s86, v0, 31
-; CHECK-NEXT: v_readlane_b32 s85, v0, 30
-; CHECK-NEXT: v_readlane_b32 s84, v0, 29
-; CHECK-NEXT: v_readlane_b32 s83, v0, 28
-; CHECK-NEXT: v_readlane_b32 s82, v0, 27
-; CHECK-NEXT: v_readlane_b32 s81, v0, 26
-; CHECK-NEXT: v_readlane_b32 s80, v0, 25
-; CHECK-NEXT: v_readlane_b32 s71, v0, 24
-; CHECK-NEXT: v_readlane_b32 s70, v0, 23
-; CHECK-NEXT: v_readlane_b32 s69, v0, 22
-; CHECK-NEXT: v_readlane_b32 s68, v0, 21
-; CHECK-NEXT: v_readlane_b32 s67, v0, 20
-; CHECK-NEXT: v_readlane_b32 s66, v0, 19
-; CHECK-NEXT: v_readlane_b32 s65, v0, 18
-; CHECK-NEXT: v_readlane_b32 s64, v0, 17
-; CHECK-NEXT: v_readlane_b32 s55, v0, 16
-; CHECK-NEXT: v_readlane_b32 s54, v0, 15
-; CHECK-NEXT: v_readlane_b32 s53, v0, 14
-; CHECK-NEXT: v_readlane_b32 s52, v0, 13
-; CHECK-NEXT: v_readlane_b32 s51, v0, 12
-; CHECK-NEXT: v_readlane_b32 s50, v0, 11
-; CHECK-NEXT: v_readlane_b32 s49, v0, 10
-; CHECK-NEXT: v_readlane_b32 s48, v0, 9
-; CHECK-NEXT: v_readlane_b32 s39, v0, 8
-; CHECK-NEXT: v_readlane_b32 s38, v0, 7
-; CHECK-NEXT: v_readlane_b32 s37, v0, 6
-; CHECK-NEXT: v_readlane_b32 s36, v0, 5
-; CHECK-NEXT: v_readlane_b32 s35, v0, 4
-; CHECK-NEXT: v_readlane_b32 s34, v0, 3
-; CHECK-NEXT: v_readlane_b32 s33, v0, 2
-; CHECK-NEXT: v_readlane_b32 s31, v0, 1
-; CHECK-NEXT: v_readlane_b32 s30, v0, 0
+; CHECK-NEXT: v_readlane_b32 s31, v0, 38
+; CHECK-NEXT: v_readlane_b32 s101, v0, 36
+; CHECK-NEXT: v_readlane_b32 s100, v0, 35
+; CHECK-NEXT: v_readlane_b32 s99, v0, 34
+; CHECK-NEXT: v_readlane_b32 s98, v0, 33
+; CHECK-NEXT: v_readlane_b32 s97, v0, 32
+; CHECK-NEXT: v_readlane_b32 s96, v0, 31
+; CHECK-NEXT: v_readlane_b32 s87, v0, 30
+; CHECK-NEXT: v_readlane_b32 s86, v0, 29
+; CHECK-NEXT: v_readlane_b32 s85, v0, 28
+; CHECK-NEXT: v_readlane_b32 s84, v0, 27
+; CHECK-NEXT: v_readlane_b32 s83, v0, 26
+; CHECK-NEXT: v_readlane_b32 s82, v0, 25
+; CHECK-NEXT: v_readlane_b32 s81, v0, 24
+; CHECK-NEXT: v_readlane_b32 s80, v0, 23
+; CHECK-NEXT: v_readlane_b32 s71, v0, 22
+; CHECK-NEXT: v_readlane_b32 s70, v0, 21
+; CHECK-NEXT: v_readlane_b32 s69, v0, 20
+; CHECK-NEXT: v_readlane_b32 s68, v0, 19
+; CHECK-NEXT: v_readlane_b32 s67, v0, 18
+; CHECK-NEXT: v_readlane_b32 s66, v0, 17
+; CHECK-NEXT: v_readlane_b32 s65, v0, 16
+; CHECK-NEXT: v_readlane_b32 s64, v0, 15
+; CHECK-NEXT: v_readlane_b32 s55, v0, 14
+; CHECK-NEXT: v_readlane_b32 s54, v0, 13
+; CHECK-NEXT: v_readlane_b32 s53, v0, 12
+; CHECK-NEXT: v_readlane_b32 s52, v0, 11
+; CHECK-NEXT: v_readlane_b32 s51, v0, 10
+; CHECK-NEXT: v_readlane_b32 s50, v0, 9
+; CHECK-NEXT: v_readlane_b32 s49, v0, 8
+; CHECK-NEXT: v_readlane_b32 s48, v0, 7
+; CHECK-NEXT: v_readlane_b32 s39, v0, 6
+; CHECK-NEXT: v_readlane_b32 s38, v0, 5
+; CHECK-NEXT: v_readlane_b32 s37, v0, 4
+; CHECK-NEXT: v_readlane_b32 s36, v0, 3
+; CHECK-NEXT: v_readlane_b32 s35, v0, 2
+; CHECK-NEXT: v_readlane_b32 s34, v0, 1
+; CHECK-NEXT: v_readlane_b32 s33, v0, 0
; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
index c407f7645315d..d86fbb9754c02 100644
--- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -7178,8 +7178,8 @@ define void @stack_12xv3i32() #0 {
; VI-NEXT: v_mov_b32_e32 v30, 10
; VI-NEXT: v_writelane_b32 v40, s31, 1
; VI-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; VI-NEXT: v_readlane_b32 s31, v40, 1
; VI-NEXT: v_readlane_b32 s30, v40, 0
+; VI-NEXT: v_readlane_b32 s31, v40, 1
; VI-NEXT: s_mov_b32 s32, s33
; VI-NEXT: v_readlane_b32 s4, v40, 2
; VI-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -7246,8 +7246,8 @@ define void @stack_12xv3i32() #0 {
; CI-NEXT: v_mov_b32_e32 v30, 10
; CI-NEXT: v_writelane_b32 v40, s31, 1
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; CI-NEXT: v_readlane_b32 s31, v40, 1
; CI-NEXT: v_readlane_b32 s30, v40, 0
+; CI-NEXT: v_readlane_b32 s31, v40, 1
; CI-NEXT: s_mov_b32 s32, s33
; CI-NEXT: v_readlane_b32 s4, v40, 2
; CI-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -7314,8 +7314,8 @@ define void @stack_12xv3i32() #0 {
; GFX9-NEXT: v_mov_b32_e32 v30, 10
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s4, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -7364,8 +7364,8 @@ define void @stack_12xv3i32() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -7432,8 +7432,8 @@ define void @stack_12xv3i32() #0 {
; HSA-NEXT: v_mov_b32_e32 v30, 10
; HSA-NEXT: v_writelane_b32 v40, s31, 1
; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; HSA-NEXT: v_readlane_b32 s31, v40, 1
; HSA-NEXT: v_readlane_b32 s30, v40, 0
+; HSA-NEXT: v_readlane_b32 s31, v40, 1
; HSA-NEXT: s_mov_b32 s32, s33
; HSA-NEXT: v_readlane_b32 s4, v40, 2
; HSA-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -7517,8 +7517,8 @@ define void @stack_12xv3f32() #0 {
; VI-NEXT: v_mov_b32_e32 v30, 0x41200000
; VI-NEXT: v_writelane_b32 v40, s31, 1
; VI-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; VI-NEXT: v_readlane_b32 s31, v40, 1
; VI-NEXT: v_readlane_b32 s30, v40, 0
+; VI-NEXT: v_readlane_b32 s31, v40, 1
; VI-NEXT: s_mov_b32 s32, s33
; VI-NEXT: v_readlane_b32 s4, v40, 2
; VI-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -7585,8 +7585,8 @@ define void @stack_12xv3f32() #0 {
; CI-NEXT: v_mov_b32_e32 v30, 0x41200000
; CI-NEXT: v_writelane_b32 v40, s31, 1
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; CI-NEXT: v_readlane_b32 s31, v40, 1
; CI-NEXT: v_readlane_b32 s30, v40, 0
+; CI-NEXT: v_readlane_b32 s31, v40, 1
; CI-NEXT: s_mov_b32 s32, s33
; CI-NEXT: v_readlane_b32 s4, v40, 2
; CI-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -7653,8 +7653,8 @@ define void @stack_12xv3f32() #0 {
; GFX9-NEXT: v_mov_b32_e32 v30, 0x41200000
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s4, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -7707,8 +7707,8 @@ define void @stack_12xv3f32() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -7775,8 +7775,8 @@ define void @stack_12xv3f32() #0 {
; HSA-NEXT: v_mov_b32_e32 v30, 0x41200000
; HSA-NEXT: v_writelane_b32 v40, s31, 1
; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; HSA-NEXT: v_readlane_b32 s31, v40, 1
; HSA-NEXT: v_readlane_b32 s30, v40, 0
+; HSA-NEXT: v_readlane_b32 s31, v40, 1
; HSA-NEXT: s_mov_b32 s32, s33
; HSA-NEXT: v_readlane_b32 s4, v40, 2
; HSA-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -7868,8 +7868,8 @@ define void @stack_8xv5i32() #0 {
; VI-NEXT: v_mov_b32_e32 v30, 6
; VI-NEXT: v_writelane_b32 v40, s31, 1
; VI-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; VI-NEXT: v_readlane_b32 s31, v40, 1
; VI-NEXT: v_readlane_b32 s30, v40, 0
+; VI-NEXT: v_readlane_b32 s31, v40, 1
; VI-NEXT: s_mov_b32 s32, s33
; VI-NEXT: v_readlane_b32 s4, v40, 2
; VI-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -7944,8 +7944,8 @@ define void @stack_8xv5i32() #0 {
; CI-NEXT: v_mov_b32_e32 v30, 6
; CI-NEXT: v_writelane_b32 v40, s31, 1
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; CI-NEXT: v_readlane_b32 s31, v40, 1
; CI-NEXT: v_readlane_b32 s30, v40, 0
+; CI-NEXT: v_readlane_b32 s31, v40, 1
; CI-NEXT: s_mov_b32 s32, s33
; CI-NEXT: v_readlane_b32 s4, v40, 2
; CI-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -8020,8 +8020,8 @@ define void @stack_8xv5i32() #0 {
; GFX9-NEXT: v_mov_b32_e32 v30, 6
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s4, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -8075,8 +8075,8 @@ define void @stack_8xv5i32() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -8151,8 +8151,8 @@ define void @stack_8xv5i32() #0 {
; HSA-NEXT: v_mov_b32_e32 v30, 6
; HSA-NEXT: v_writelane_b32 v40, s31, 1
; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; HSA-NEXT: v_readlane_b32 s31, v40, 1
; HSA-NEXT: v_readlane_b32 s30, v40, 0
+; HSA-NEXT: v_readlane_b32 s31, v40, 1
; HSA-NEXT: s_mov_b32 s32, s33
; HSA-NEXT: v_readlane_b32 s4, v40, 2
; HSA-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -8240,8 +8240,8 @@ define void @stack_8xv5f32() #0 {
; VI-NEXT: v_mov_b32_e32 v30, 0x40c00000
; VI-NEXT: v_writelane_b32 v40, s31, 1
; VI-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; VI-NEXT: v_readlane_b32 s31, v40, 1
; VI-NEXT: v_readlane_b32 s30, v40, 0
+; VI-NEXT: v_readlane_b32 s31, v40, 1
; VI-NEXT: s_mov_b32 s32, s33
; VI-NEXT: v_readlane_b32 s4, v40, 2
; VI-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -8316,8 +8316,8 @@ define void @stack_8xv5f32() #0 {
; CI-NEXT: v_mov_b32_e32 v30, 0x40c00000
; CI-NEXT: v_writelane_b32 v40, s31, 1
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; CI-NEXT: v_readlane_b32 s31, v40, 1
; CI-NEXT: v_readlane_b32 s30, v40, 0
+; CI-NEXT: v_readlane_b32 s31, v40, 1
; CI-NEXT: s_mov_b32 s32, s33
; CI-NEXT: v_readlane_b32 s4, v40, 2
; CI-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -8392,8 +8392,8 @@ define void @stack_8xv5f32() #0 {
; GFX9-NEXT: v_mov_b32_e32 v30, 0x40c00000
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s4, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -8450,8 +8450,8 @@ define void @stack_8xv5f32() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -8526,8 +8526,8 @@ define void @stack_8xv5f32() #0 {
; HSA-NEXT: v_mov_b32_e32 v30, 0x40c00000
; HSA-NEXT: v_writelane_b32 v40, s31, 1
; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; HSA-NEXT: v_readlane_b32 s31, v40, 1
; HSA-NEXT: v_readlane_b32 s30, v40, 0
+; HSA-NEXT: v_readlane_b32 s31, v40, 1
; HSA-NEXT: s_mov_b32 s32, s33
; HSA-NEXT: v_readlane_b32 s4, v40, 2
; HSA-NEXT: s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
index b250227735bd3..26727e53d990c 100644
--- a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
@@ -25,8 +25,8 @@ define void @use_vcc() #1 {
; GCN: v_writelane_b32 v40, s30, 0
; GCN: v_writelane_b32 v40, s31, 1
; GCN: s_swappc_b64
-; GCN: v_readlane_b32 s31, v40, 1
; GCN: v_readlane_b32 s30, v40, 0
+; GCN: v_readlane_b32 s31, v40, 1
; GCN: v_readlane_b32 s4, v40, 2
; GCN: s_mov_b32 s33, s4
; GCN: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll
index aed1079158154..f9070339093da 100644
--- a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll
@@ -40,22 +40,22 @@ define void @test_func_call_external_void_func_void_clobber_s30_s31_call_externa
; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; MUBUF-NEXT: s_mov_b64 exec, s[6:7]
; MUBUF-NEXT: v_writelane_b32 v40, s4, 4
-; MUBUF-NEXT: v_writelane_b32 v40, s30, 0
-; MUBUF-NEXT: v_writelane_b32 v40, s31, 1
+; MUBUF-NEXT: v_writelane_b32 v40, s34, 0
; MUBUF-NEXT: s_addk_i32 s32, 0x400
-; MUBUF-NEXT: v_writelane_b32 v40, s34, 2
-; MUBUF-NEXT: v_writelane_b32 v40, s35, 3
+; MUBUF-NEXT: v_writelane_b32 v40, s35, 1
+; MUBUF-NEXT: v_writelane_b32 v40, s30, 2
; MUBUF-NEXT: s_getpc_b64 s[34:35]
; MUBUF-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4
; MUBUF-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12
+; MUBUF-NEXT: v_writelane_b32 v40, s31, 3
; MUBUF-NEXT: s_swappc_b64 s[30:31], s[34:35]
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ;;#ASMEND
; MUBUF-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; MUBUF-NEXT: v_readlane_b32 s35, v40, 3
-; MUBUF-NEXT: v_readlane_b32 s34, v40, 2
-; MUBUF-NEXT: v_readlane_b32 s31, v40, 1
-; MUBUF-NEXT: v_readlane_b32 s30, v40, 0
+; MUBUF-NEXT: v_readlane_b32 s30, v40, 2
+; MUBUF-NEXT: v_readlane_b32 s31, v40, 3
+; MUBUF-NEXT: v_readlane_b32 s35, v40, 1
+; MUBUF-NEXT: v_readlane_b32 s34, v40, 0
; MUBUF-NEXT: s_mov_b32 s32, s33
; MUBUF-NEXT: v_readlane_b32 s4, v40, 4
; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -74,22 +74,22 @@ define void @test_func_call_external_void_func_void_clobber_s30_s31_call_externa
; FLATSCR-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; FLATSCR-NEXT: s_mov_b64 exec, s[2:3]
; FLATSCR-NEXT: v_writelane_b32 v40, s0, 4
-; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0
-; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1
+; FLATSCR-NEXT: v_writelane_b32 v40, s34, 0
; FLATSCR-NEXT: s_add_i32 s32, s32, 16
-; FLATSCR-NEXT: v_writelane_b32 v40, s34, 2
-; FLATSCR-NEXT: v_writelane_b32 v40, s35, 3
+; FLATSCR-NEXT: v_writelane_b32 v40, s35, 1
+; FLATSCR-NEXT: v_writelane_b32 v40, s30, 2
; FLATSCR-NEXT: s_getpc_b64 s[34:35]
; FLATSCR-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4
; FLATSCR-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12
+; FLATSCR-NEXT: v_writelane_b32 v40, s31, 3
; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[34:35]
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ;;#ASMEND
; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; FLATSCR-NEXT: v_readlane_b32 s35, v40, 3
-; FLATSCR-NEXT: v_readlane_b32 s34, v40, 2
-; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1
-; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0
+; FLATSCR-NEXT: v_readlane_b32 s30, v40, 2
+; FLATSCR-NEXT: v_readlane_b32 s31, v40, 3
+; FLATSCR-NEXT: v_readlane_b32 s35, v40, 1
+; FLATSCR-NEXT: v_readlane_b32 s34, v40, 0
; FLATSCR-NEXT: s_mov_b32 s32, s33
; FLATSCR-NEXT: v_readlane_b32 s0, v40, 4
; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1
@@ -114,20 +114,20 @@ define void @test_func_call_external_void_funcx2() #0 {
; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; MUBUF-NEXT: s_mov_b64 exec, s[6:7]
; MUBUF-NEXT: v_writelane_b32 v40, s4, 4
-; MUBUF-NEXT: v_writelane_b32 v40, s30, 0
-; MUBUF-NEXT: v_writelane_b32 v40, s31, 1
+; MUBUF-NEXT: v_writelane_b32 v40, s34, 0
; MUBUF-NEXT: s_addk_i32 s32, 0x400
-; MUBUF-NEXT: v_writelane_b32 v40, s34, 2
-; MUBUF-NEXT: v_writelane_b32 v40, s35, 3
+; MUBUF-NEXT: v_writelane_b32 v40, s35, 1
+; MUBUF-NEXT: v_writelane_b32 v40, s30, 2
; MUBUF-NEXT: s_getpc_b64 s[34:35]
; MUBUF-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4
; MUBUF-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12
+; MUBUF-NEXT: v_writelane_b32 v40, s31, 3
; MUBUF-NEXT: s_swappc_b64 s[30:31], s[34:35]
; MUBUF-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; MUBUF-NEXT: v_readlane_b32 s35, v40, 3
-; MUBUF-NEXT: v_readlane_b32 s34, v40, 2
-; MUBUF-NEXT: v_readlane_b32 s31, v40, 1
-; MUBUF-NEXT: v_readlane_b32 s30, v40, 0
+; MUBUF-NEXT: v_readlane_b32 s30, v40, 2
+; MUBUF-NEXT: v_readlane_b32 s31, v40, 3
+; MUBUF-NEXT: v_readlane_b32 s35, v40, 1
+; MUBUF-NEXT: v_readlane_b32 s34, v40, 0
; MUBUF-NEXT: s_mov_b32 s32, s33
; MUBUF-NEXT: v_readlane_b32 s4, v40, 4
; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -146,20 +146,20 @@ define void @test_func_call_external_void_funcx2() #0 {
; FLATSCR-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; FLATSCR-NEXT: s_mov_b64 exec, s[2:3]
; FLATSCR-NEXT: v_writelane_b32 v40, s0, 4
-; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0
-; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1
+; FLATSCR-NEXT: v_writelane_b32 v40, s34, 0
; FLATSCR-NEXT: s_add_i32 s32, s32, 16
-; FLATSCR-NEXT: v_writelane_b32 v40, s34, 2
-; FLATSCR-NEXT: v_writelane_b32 v40, s35, 3
+; FLATSCR-NEXT: v_writelane_b32 v40, s35, 1
+; FLATSCR-NEXT: v_writelane_b32 v40, s30, 2
; FLATSCR-NEXT: s_getpc_b64 s[34:35]
; FLATSCR-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4
; FLATSCR-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12
+; FLATSCR-NEXT: v_writelane_b32 v40, s31, 3
; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[34:35]
; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; FLATSCR-NEXT: v_readlane_b32 s35, v40, 3
-; FLATSCR-NEXT: v_readlane_b32 s34, v40, 2
-; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1
-; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0
+; FLATSCR-NEXT: v_readlane_b32 s30, v40, 2
+; FLATSCR-NEXT: v_readlane_b32 s31, v40, 3
+; FLATSCR-NEXT: v_readlane_b32 s35, v40, 1
+; FLATSCR-NEXT: v_readlane_b32 s34, v40, 0
; FLATSCR-NEXT: s_mov_b32 s32, s33
; FLATSCR-NEXT: v_readlane_b32 s0, v40, 4
; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1
@@ -185,8 +185,8 @@ define void @void_func_void_clobber_s30_s31() #2 {
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ; clobber
; MUBUF-NEXT: ;;#ASMEND
-; MUBUF-NEXT: v_readlane_b32 s31, v0, 1
; MUBUF-NEXT: v_readlane_b32 s30, v0, 0
+; MUBUF-NEXT: v_readlane_b32 s31, v0, 1
; MUBUF-NEXT: s_xor_saveexec_b64 s[4:5], -1
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; MUBUF-NEXT: s_mov_b64 exec, s[4:5]
@@ -204,8 +204,8 @@ define void @void_func_void_clobber_s30_s31() #2 {
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ; clobber
; FLATSCR-NEXT: ;;#ASMEND
-; FLATSCR-NEXT: v_readlane_b32 s31, v0, 1
; FLATSCR-NEXT: v_readlane_b32 s30, v0, 0
+; FLATSCR-NEXT: v_readlane_b32 s31, v0, 1
; FLATSCR-NEXT: s_xor_saveexec_b64 s[0:1], -1
; FLATSCR-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
; FLATSCR-NEXT: s_mov_b64 exec, s[0:1]
@@ -452,23 +452,23 @@ define void @callee_saved_sgpr_func() #2 {
; MUBUF-NEXT: s_mov_b64 exec, s[6:7]
; MUBUF-NEXT: v_writelane_b32 v40, s4, 3
; MUBUF-NEXT: s_addk_i32 s32, 0x400
-; MUBUF-NEXT: v_writelane_b32 v40, s30, 0
-; MUBUF-NEXT: v_writelane_b32 v40, s31, 1
+; MUBUF-NEXT: v_writelane_b32 v40, s34, 0
+; MUBUF-NEXT: v_writelane_b32 v40, s30, 1
; MUBUF-NEXT: s_getpc_b64 s[4:5]
; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
-; MUBUF-NEXT: v_writelane_b32 v40, s34, 2
+; MUBUF-NEXT: v_writelane_b32 v40, s31, 2
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ; def s40
; MUBUF-NEXT: ;;#ASMEND
; MUBUF-NEXT: s_mov_b32 s34, s40
; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; MUBUF-NEXT: v_readlane_b32 s30, v40, 1
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ; use s34
; MUBUF-NEXT: ;;#ASMEND
-; MUBUF-NEXT: v_readlane_b32 s34, v40, 2
-; MUBUF-NEXT: v_readlane_b32 s31, v40, 1
-; MUBUF-NEXT: v_readlane_b32 s30, v40, 0
+; MUBUF-NEXT: v_readlane_b32 s31, v40, 2
+; MUBUF-NEXT: v_readlane_b32 s34, v40, 0
; MUBUF-NEXT: s_mov_b32 s32, s33
; MUBUF-NEXT: v_readlane_b32 s4, v40, 3
; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -488,23 +488,23 @@ define void @callee_saved_sgpr_func() #2 {
; FLATSCR-NEXT: s_mov_b64 exec, s[2:3]
; FLATSCR-NEXT: v_writelane_b32 v40, s0, 3
; FLATSCR-NEXT: s_add_i32 s32, s32, 16
-; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0
-; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1
+; FLATSCR-NEXT: v_writelane_b32 v40, s34, 0
+; FLATSCR-NEXT: v_writelane_b32 v40, s30, 1
; FLATSCR-NEXT: s_getpc_b64 s[0:1]
; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4
; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12
-; FLATSCR-NEXT: v_writelane_b32 v40, s34, 2
+; FLATSCR-NEXT: v_writelane_b32 v40, s31, 2
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ; def s40
; FLATSCR-NEXT: ;;#ASMEND
; FLATSCR-NEXT: s_mov_b32 s34, s40
; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; FLATSCR-NEXT: v_readlane_b32 s30, v40, 1
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ; use s34
; FLATSCR-NEXT: ;;#ASMEND
-; FLATSCR-NEXT: v_readlane_b32 s34, v40, 2
-; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1
-; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0
+; FLATSCR-NEXT: v_readlane_b32 s31, v40, 2
+; FLATSCR-NEXT: v_readlane_b32 s34, v40, 0
; FLATSCR-NEXT: s_mov_b32 s32, s33
; FLATSCR-NEXT: v_readlane_b32 s0, v40, 3
; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1
@@ -555,13 +555,13 @@ define void @callee_saved_sgpr_vgpr_func() #2 {
; MUBUF-NEXT: s_mov_b64 exec, s[6:7]
; MUBUF-NEXT: v_writelane_b32 v41, s4, 3
; MUBUF-NEXT: s_addk_i32 s32, 0x400
-; MUBUF-NEXT: v_writelane_b32 v41, s30, 0
-; MUBUF-NEXT: v_writelane_b32 v41, s31, 1
+; MUBUF-NEXT: v_writelane_b32 v41, s34, 0
+; MUBUF-NEXT: v_writelane_b32 v41, s30, 1
; MUBUF-NEXT: s_getpc_b64 s[4:5]
; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
-; MUBUF-NEXT: v_writelane_b32 v41, s34, 2
+; MUBUF-NEXT: v_writelane_b32 v41, s31, 2
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ; def s40
; MUBUF-NEXT: ;;#ASMEND
@@ -577,9 +577,9 @@ define void @callee_saved_sgpr_vgpr_func() #2 {
; MUBUF-NEXT: ; use v40
; MUBUF-NEXT: ;;#ASMEND
; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
-; MUBUF-NEXT: v_readlane_b32 s34, v41, 2
-; MUBUF-NEXT: v_readlane_b32 s31, v41, 1
-; MUBUF-NEXT: v_readlane_b32 s30, v41, 0
+; MUBUF-NEXT: v_readlane_b32 s30, v41, 1
+; MUBUF-NEXT: v_readlane_b32 s31, v41, 2
+; MUBUF-NEXT: v_readlane_b32 s34, v41, 0
; MUBUF-NEXT: s_mov_b32 s32, s33
; MUBUF-NEXT: v_readlane_b32 s4, v41, 3
; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -599,13 +599,13 @@ define void @callee_saved_sgpr_vgpr_func() #2 {
; FLATSCR-NEXT: s_mov_b64 exec, s[2:3]
; FLATSCR-NEXT: v_writelane_b32 v41, s0, 3
; FLATSCR-NEXT: s_add_i32 s32, s32, 16
-; FLATSCR-NEXT: v_writelane_b32 v41, s30, 0
-; FLATSCR-NEXT: v_writelane_b32 v41, s31, 1
+; FLATSCR-NEXT: v_writelane_b32 v41, s34, 0
+; FLATSCR-NEXT: v_writelane_b32 v41, s30, 1
; FLATSCR-NEXT: s_getpc_b64 s[0:1]
; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4
; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12
; FLATSCR-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
-; FLATSCR-NEXT: v_writelane_b32 v41, s34, 2
+; FLATSCR-NEXT: v_writelane_b32 v41, s31, 2
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ; def s40
; FLATSCR-NEXT: ;;#ASMEND
@@ -621,9 +621,9 @@ define void @callee_saved_sgpr_vgpr_func() #2 {
; FLATSCR-NEXT: ; use v40
; FLATSCR-NEXT: ;;#ASMEND
; FLATSCR-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
-; FLATSCR-NEXT: v_readlane_b32 s34, v41, 2
-; FLATSCR-NEXT: v_readlane_b32 s31, v41, 1
-; FLATSCR-NEXT: v_readlane_b32 s30, v41, 0
+; FLATSCR-NEXT: v_readlane_b32 s30, v41, 1
+; FLATSCR-NEXT: v_readlane_b32 s31, v41, 2
+; FLATSCR-NEXT: v_readlane_b32 s34, v41, 0
; FLATSCR-NEXT: s_mov_b32 s32, s33
; FLATSCR-NEXT: v_readlane_b32 s0, v41, 3
; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1
diff --git a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
index e7254eb5c3465..07f58df81c502 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
@@ -132,8 +132,8 @@ define void @callee_with_stack_and_call() #0 {
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s33
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; MUBUF-NEXT: v_readlane_b32 s31, v40, 1
; MUBUF-NEXT: v_readlane_b32 s30, v40, 0
+; MUBUF-NEXT: v_readlane_b32 s31, v40, 1
; MUBUF-NEXT: s_mov_b32 s32, s33
; MUBUF-NEXT: v_readlane_b32 s4, v40, 2
; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -162,8 +162,8 @@ define void @callee_with_stack_and_call() #0 {
; FLATSCR-NEXT: scratch_store_dword off, v0, s33
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1
; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0
+; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1
; FLATSCR-NEXT: s_mov_b32 s32, s33
; FLATSCR-NEXT: v_readlane_b32 s0, v40, 2
; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1
@@ -201,8 +201,8 @@ define void @callee_no_stack_with_call() #0 {
; MUBUF-NEXT: s_addc_u32 s17, s17, external_void_func_void@rel32@hi+12
; MUBUF-NEXT: v_writelane_b32 v40, s31, 1
; MUBUF-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; MUBUF-NEXT: v_readlane_b32 s31, v40, 1
; MUBUF-NEXT: v_readlane_b32 s30, v40, 0
+; MUBUF-NEXT: v_readlane_b32 s31, v40, 1
; MUBUF-NEXT: s_mov_b32 s32, s33
; MUBUF-NEXT: v_readlane_b32 s4, v40, 2
; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -228,8 +228,8 @@ define void @callee_no_stack_with_call() #0 {
; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12
; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1
; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1
; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0
+; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1
; FLATSCR-NEXT: s_mov_b32 s32, s33
; FLATSCR-NEXT: v_readlane_b32 s0, v40, 2
; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1
@@ -359,24 +359,24 @@ define void @callee_func_sgpr_spill_no_calls(i32 %in) #0 {
; FLATSCR-NEXT: s_or_saveexec_b64 s[0:1], -1
; FLATSCR-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
; FLATSCR-NEXT: s_mov_b64 exec, s[0:1]
-; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0
-; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1
-; FLATSCR-NEXT: v_writelane_b32 v40, s36, 2
-; FLATSCR-NEXT: v_writelane_b32 v40, s37, 3
-; FLATSCR-NEXT: v_writelane_b32 v40, s38, 4
-; FLATSCR-NEXT: v_writelane_b32 v40, s39, 5
-; FLATSCR-NEXT: v_writelane_b32 v40, s48, 6
-; FLATSCR-NEXT: v_writelane_b32 v40, s49, 7
-; FLATSCR-NEXT: v_writelane_b32 v40, s50, 8
-; FLATSCR-NEXT: v_writelane_b32 v40, s51, 9
-; FLATSCR-NEXT: v_writelane_b32 v40, s52, 10
-; FLATSCR-NEXT: v_writelane_b32 v40, s53, 11
-; FLATSCR-NEXT: v_writelane_b32 v40, s54, 12
-; FLATSCR-NEXT: v_writelane_b32 v40, s55, 13
-; FLATSCR-NEXT: v_writelane_b32 v40, s64, 14
-; FLATSCR-NEXT: v_writelane_b32 v40, s65, 15
-; FLATSCR-NEXT: v_writelane_b32 v40, s66, 16
-; FLATSCR-NEXT: v_writelane_b32 v40, s67, 17
+; FLATSCR-NEXT: v_writelane_b32 v40, s36, 0
+; FLATSCR-NEXT: v_writelane_b32 v40, s37, 1
+; FLATSCR-NEXT: v_writelane_b32 v40, s38, 2
+; FLATSCR-NEXT: v_writelane_b32 v40, s39, 3
+; FLATSCR-NEXT: v_writelane_b32 v40, s48, 4
+; FLATSCR-NEXT: v_writelane_b32 v40, s49, 5
+; FLATSCR-NEXT: v_writelane_b32 v40, s50, 6
+; FLATSCR-NEXT: v_writelane_b32 v40, s51, 7
+; FLATSCR-NEXT: v_writelane_b32 v40, s52, 8
+; FLATSCR-NEXT: v_writelane_b32 v40, s53, 9
+; FLATSCR-NEXT: v_writelane_b32 v40, s54, 10
+; FLATSCR-NEXT: v_writelane_b32 v40, s55, 11
+; FLATSCR-NEXT: v_writelane_b32 v40, s64, 12
+; FLATSCR-NEXT: v_writelane_b32 v40, s65, 13
+; FLATSCR-NEXT: v_writelane_b32 v40, s66, 14
+; FLATSCR-NEXT: v_writelane_b32 v40, s67, 15
+; FLATSCR-NEXT: v_writelane_b32 v40, s30, 16
+; FLATSCR-NEXT: v_writelane_b32 v40, s31, 17
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ;;#ASMEND
; FLATSCR-NEXT: ;;#ASMSTART
@@ -414,6 +414,7 @@ define void @callee_func_sgpr_spill_no_calls(i32 %in) #0 {
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ; use s[16:31]
; FLATSCR-NEXT: ;;#ASMEND
+; FLATSCR-NEXT: v_readlane_b32 s30, v40, 16
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ; use s[72:79]
; FLATSCR-NEXT: ;;#ASMEND
@@ -423,24 +424,23 @@ define void @callee_func_sgpr_spill_no_calls(i32 %in) #0 {
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ; use s[0:15]
; FLATSCR-NEXT: ;;#ASMEND
-; FLATSCR-NEXT: v_readlane_b32 s67, v40, 17
-; FLATSCR-NEXT: v_readlane_b32 s66, v40, 16
-; FLATSCR-NEXT: v_readlane_b32 s65, v40, 15
-; FLATSCR-NEXT: v_readlane_b32 s64, v40, 14
-; FLATSCR-NEXT: v_readlane_b32 s55, v40, 13
-; FLATSCR-NEXT: v_readlane_b32 s54, v40, 12
-; FLATSCR-NEXT: v_readlane_b32 s53, v40, 11
-; FLATSCR-NEXT: v_readlane_b32 s52, v40, 10
-; FLATSCR-NEXT: v_readlane_b32 s51, v40, 9
-; FLATSCR-NEXT: v_readlane_b32 s50, v40, 8
-; FLATSCR-NEXT: v_readlane_b32 s49, v40, 7
-; FLATSCR-NEXT: v_readlane_b32 s48, v40, 6
-; FLATSCR-NEXT: v_readlane_b32 s39, v40, 5
-; FLATSCR-NEXT: v_readlane_b32 s38, v40, 4
-; FLATSCR-NEXT: v_readlane_b32 s37, v40, 3
-; FLATSCR-NEXT: v_readlane_b32 s36, v40, 2
-; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1
-; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0
+; FLATSCR-NEXT: v_readlane_b32 s31, v40, 17
+; FLATSCR-NEXT: v_readlane_b32 s67, v40, 15
+; FLATSCR-NEXT: v_readlane_b32 s66, v40, 14
+; FLATSCR-NEXT: v_readlane_b32 s65, v40, 13
+; FLATSCR-NEXT: v_readlane_b32 s64, v40, 12
+; FLATSCR-NEXT: v_readlane_b32 s55, v40, 11
+; FLATSCR-NEXT: v_readlane_b32 s54, v40, 10
+; FLATSCR-NEXT: v_readlane_b32 s53, v40, 9
+; FLATSCR-NEXT: v_readlane_b32 s52, v40, 8
+; FLATSCR-NEXT: v_readlane_b32 s51, v40, 7
+; FLATSCR-NEXT: v_readlane_b32 s50, v40, 6
+; FLATSCR-NEXT: v_readlane_b32 s49, v40, 5
+; FLATSCR-NEXT: v_readlane_b32 s48, v40, 4
+; FLATSCR-NEXT: v_readlane_b32 s39, v40, 3
+; FLATSCR-NEXT: v_readlane_b32 s38, v40, 2
+; FLATSCR-NEXT: v_readlane_b32 s37, v40, 1
+; FLATSCR-NEXT: v_readlane_b32 s36, v40, 0
; FLATSCR-NEXT: s_or_saveexec_b64 s[0:1], -1
; FLATSCR-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
; FLATSCR-NEXT: s_mov_b64 exec, s[0:1]
@@ -971,14 +971,14 @@ define void @no_unused_non_csr_sgpr_for_fp() #1 {
; MUBUF-NEXT: s_mov_b64 exec, s[4:5]
; MUBUF-NEXT: v_writelane_b32 v1, s30, 0
; MUBUF-NEXT: v_mov_b32_e32 v0, 0
-; MUBUF-NEXT: s_addk_i32 s32, 0x300
; MUBUF-NEXT: v_writelane_b32 v1, s31, 1
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s33
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ;;#ASMEND
-; MUBUF-NEXT: v_readlane_b32 s31, v1, 1
+; MUBUF-NEXT: s_addk_i32 s32, 0x300
; MUBUF-NEXT: v_readlane_b32 s30, v1, 0
+; MUBUF-NEXT: v_readlane_b32 s31, v1, 1
; MUBUF-NEXT: s_mov_b32 s32, s33
; MUBUF-NEXT: s_xor_saveexec_b64 s[4:5], -1
; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
@@ -997,14 +997,14 @@ define void @no_unused_non_csr_sgpr_for_fp() #1 {
; FLATSCR-NEXT: s_mov_b64 exec, s[0:1]
; FLATSCR-NEXT: v_writelane_b32 v1, s30, 0
; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
-; FLATSCR-NEXT: s_add_i32 s32, s32, 12
; FLATSCR-NEXT: v_writelane_b32 v1, s31, 1
; FLATSCR-NEXT: scratch_store_dword off, v0, s33
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ;;#ASMEND
-; FLATSCR-NEXT: v_readlane_b32 s31, v1, 1
+; FLATSCR-NEXT: s_add_i32 s32, s32, 12
; FLATSCR-NEXT: v_readlane_b32 s30, v1, 0
+; FLATSCR-NEXT: v_readlane_b32 s31, v1, 1
; FLATSCR-NEXT: s_mov_b32 s32, s33
; FLATSCR-NEXT: s_xor_saveexec_b64 s[0:1], -1
; FLATSCR-NEXT: scratch_load_dword v1, off, s33 offset:4 ; 4-byte Folded Reload
@@ -1037,17 +1037,17 @@ define void @no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr() #1 {
; MUBUF-NEXT: s_mov_b64 exec, s[4:5]
; MUBUF-NEXT: v_writelane_b32 v40, s30, 0
; MUBUF-NEXT: v_mov_b32_e32 v0, 0
-; MUBUF-NEXT: s_addk_i32 s32, 0x300
; MUBUF-NEXT: v_writelane_b32 v40, s31, 1
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s33
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ;;#ASMEND
+; MUBUF-NEXT: s_addk_i32 s32, 0x300
+; MUBUF-NEXT: v_readlane_b32 s30, v40, 0
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ; clobber nonpreserved initial VGPRs
; MUBUF-NEXT: ;;#ASMEND
; MUBUF-NEXT: v_readlane_b32 s31, v40, 1
-; MUBUF-NEXT: v_readlane_b32 s30, v40, 0
; MUBUF-NEXT: s_mov_b32 s32, s33
; MUBUF-NEXT: s_or_saveexec_b64 s[4:5], -1
; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
@@ -1066,17 +1066,17 @@ define void @no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr() #1 {
; FLATSCR-NEXT: s_mov_b64 exec, s[0:1]
; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0
; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
-; FLATSCR-NEXT: s_add_i32 s32, s32, 12
; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1
; FLATSCR-NEXT: scratch_store_dword off, v0, s33
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ;;#ASMEND
+; FLATSCR-NEXT: s_add_i32 s32, s32, 12
+; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ; clobber nonpreserved initial VGPRs
; FLATSCR-NEXT: ;;#ASMEND
; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1
-; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0
; FLATSCR-NEXT: s_mov_b32 s32, s33
; FLATSCR-NEXT: s_or_saveexec_b64 s[0:1], -1
; FLATSCR-NEXT: scratch_load_dword v40, off, s33 offset:4 ; 4-byte Folded Reload
@@ -1118,18 +1118,18 @@ define void @scratch_reg_needed_mubuf_offset(ptr addrspace(5) byval([4096 x i8])
; MUBUF-NEXT: v_writelane_b32 v40, s30, 0
; MUBUF-NEXT: v_mov_b32_e32 v0, 0
; MUBUF-NEXT: v_mov_b32_e32 v1, 0x1000
-; MUBUF-NEXT: s_add_i32 s32, s32, 0x40300
; MUBUF-NEXT: v_writelane_b32 v40, s31, 1
; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], s33 offen
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ; clobber nonpreserved SGPRs
; MUBUF-NEXT: ;;#ASMEND
+; MUBUF-NEXT: s_add_i32 s32, s32, 0x40300
+; MUBUF-NEXT: v_readlane_b32 s30, v40, 0
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ; clobber nonpreserved VGPRs
; MUBUF-NEXT: ;;#ASMEND
; MUBUF-NEXT: v_readlane_b32 s31, v40, 1
-; MUBUF-NEXT: v_readlane_b32 s30, v40, 0
; MUBUF-NEXT: s_mov_b32 s32, s33
; MUBUF-NEXT: s_or_saveexec_b64 s[4:5], -1
; MUBUF-NEXT: s_add_i32 s6, s33, 0x40100
@@ -1158,11 +1158,11 @@ define void @scratch_reg_needed_mubuf_offset(ptr addrspace(5) byval([4096 x i8])
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ; clobber nonpreserved SGPRs
; FLATSCR-NEXT: ;;#ASMEND
+; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ; clobber nonpreserved VGPRs
; FLATSCR-NEXT: ;;#ASMEND
; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1
-; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0
; FLATSCR-NEXT: s_mov_b32 s32, s33
; FLATSCR-NEXT: s_or_saveexec_b64 s[0:1], -1
; FLATSCR-NEXT: s_add_i32 s2, s33, 0x1004
@@ -1220,8 +1220,8 @@ define void @ipra_call_with_stack() #0 {
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s33
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; MUBUF-NEXT: v_readlane_b32 s31, v1, 1
; MUBUF-NEXT: v_readlane_b32 s30, v1, 0
+; MUBUF-NEXT: v_readlane_b32 s31, v1, 1
; MUBUF-NEXT: s_mov_b32 s32, s33
; MUBUF-NEXT: s_xor_saveexec_b64 s[4:5], -1
; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
@@ -1248,8 +1248,8 @@ define void @ipra_call_with_stack() #0 {
; FLATSCR-NEXT: scratch_store_dword off, v0, s33
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; FLATSCR-NEXT: v_readlane_b32 s31, v1, 1
; FLATSCR-NEXT: v_readlane_b32 s30, v1, 0
+; FLATSCR-NEXT: v_readlane_b32 s31, v1, 1
; FLATSCR-NEXT: s_mov_b32 s32, s33
; FLATSCR-NEXT: s_xor_saveexec_b64 s[0:1], -1
; FLATSCR-NEXT: scratch_load_dword v1, off, s33 offset:4 ; 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll
index 5f965ba431ab5..bb5963244da3c 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll
@@ -430,8 +430,8 @@ define void @func_indirect_use_workitem_id_x() #1 {
; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_readlane_b32 s30, v40, 0
+; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: v_readlane_b32 s4, v40, 2
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -463,8 +463,8 @@ define void @func_indirect_use_workitem_id_y() #1 {
; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_readlane_b32 s30, v40, 0
+; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: v_readlane_b32 s4, v40, 2
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -496,8 +496,8 @@ define void @func_indirect_use_workitem_id_z() #1 {
; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_readlane_b32 s30, v40, 0
+; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: v_readlane_b32 s4, v40, 2
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -984,8 +984,8 @@ define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 {
; GFX7-NEXT: v_writelane_b32 v40, s31, 1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX7-NEXT: v_readlane_b32 s31, v40, 1
; GFX7-NEXT: v_readlane_b32 s30, v40, 0
+; GFX7-NEXT: v_readlane_b32 s31, v40, 1
; GFX7-NEXT: s_mov_b32 s32, s33
; GFX7-NEXT: v_readlane_b32 s4, v40, 2
; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -1048,8 +1048,8 @@ define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 {
; GFX90A-NEXT: v_writelane_b32 v40, s31, 1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX90A-NEXT: v_readlane_b32 s31, v40, 1
; GFX90A-NEXT: v_readlane_b32 s30, v40, 0
+; GFX90A-NEXT: v_readlane_b32 s31, v40, 1
; GFX90A-NEXT: s_mov_b32 s32, s33
; GFX90A-NEXT: v_readlane_b32 s4, v40, 2
; GFX90A-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -1094,8 +1094,8 @@ define void @too_many_args_call_too_many_args_use_workitem_id_x(
; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_readlane_b32 s30, v40, 0
+; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: v_readlane_b32 s4, v40, 2
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -1445,8 +1445,8 @@ define void @func_call_too_many_args_use_workitem_id_x_byval() #1 {
; GCN-NEXT: v_mov_b32_e32 v0, 10
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_readlane_b32 s30, v40, 0
+; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: v_readlane_b32 s4, v40, 2
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
index bb2f06bfe83f8..f20be656f3af0 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
@@ -275,8 +275,8 @@ define void @func_indirect_use_workitem_id_x() #1 {
; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_readlane_b32 s30, v40, 0
+; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: v_readlane_b32 s4, v40, 2
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -308,8 +308,8 @@ define void @func_indirect_use_workitem_id_y() #1 {
; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_readlane_b32 s30, v40, 0
+; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: v_readlane_b32 s4, v40, 2
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -341,8 +341,8 @@ define void @func_indirect_use_workitem_id_z() #1 {
; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_readlane_b32 s30, v40, 0
+; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: v_readlane_b32 s4, v40, 2
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -696,8 +696,8 @@ define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 {
; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_readlane_b32 s30, v40, 0
+; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: v_readlane_b32 s4, v40, 2
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -742,8 +742,8 @@ define void @too_many_args_call_too_many_args_use_workitem_id_x(
; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_readlane_b32 s30, v40, 0
+; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: v_readlane_b32 s4, v40, 2
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -1019,8 +1019,8 @@ define void @func_call_too_many_args_use_workitem_id_x_byval() #1 {
; GCN-NEXT: v_mov_b32_e32 v0, 10
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_readlane_b32 s30, v40, 0
+; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: v_readlane_b32 s4, v40, 2
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -1469,8 +1469,8 @@ define void @func_call_no_workitem_id_hints() #2 {
; GCN-NEXT: v_mov_b32_e32 v0, 9
; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_readlane_b32 s30, v40, 0
+; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: v_readlane_b32 s4, v40, 2
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
index 38c20c7cf62d6..0cab17c9bfcfc 100644
--- a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
@@ -40,8 +40,8 @@ define float @call_split_type_used_outside_block_v2f32() #0 {
; GCN-NEXT: s_addc_u32 s17, s17, func_v2f32@rel32@hi+12
; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_readlane_b32 s30, v40, 0
+; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: v_readlane_b32 s4, v40, 2
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -76,8 +76,8 @@ define float @call_split_type_used_outside_block_v3f32() #0 {
; GCN-NEXT: s_addc_u32 s17, s17, func_v3f32@rel32@hi+12
; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_readlane_b32 s30, v40, 0
+; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: v_readlane_b32 s4, v40, 2
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -112,8 +112,8 @@ define half @call_split_type_used_outside_block_v4f16() #0 {
; GCN-NEXT: s_addc_u32 s17, s17, func_v4f16@rel32@hi+12
; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_readlane_b32 s30, v40, 0
+; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: v_readlane_b32 s4, v40, 2
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -148,8 +148,8 @@ define { i32, half } @call_split_type_used_outside_block_struct() #0 {
; GCN-NEXT: s_addc_u32 s17, s17, func_struct@rel32@hi+12
; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_readlane_b32 s30, v40, 0
+; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_mov_b32_e32 v1, v4
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: v_readlane_b32 s4, v40, 2
diff --git a/llvm/test/CodeGen/AMDGPU/debug-frame.ll b/llvm/test/CodeGen/AMDGPU/debug-frame.ll
index 676144e65c10f..7d38cea998256 100644
--- a/llvm/test/CodeGen/AMDGPU/debug-frame.ll
+++ b/llvm/test/CodeGen/AMDGPU/debug-frame.ll
@@ -2005,8 +2005,8 @@ define hidden void @func_call_clobber() #0 {
; GFX900-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12
; GFX900-NEXT: v_writelane_b32 v40, s31, 1
; GFX900-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX900-NEXT: v_readlane_b32 s31, v40, 1
; GFX900-NEXT: v_readlane_b32 s30, v40, 0
+; GFX900-NEXT: v_readlane_b32 s31, v40, 1
; GFX900-NEXT: s_mov_b32 s32, s33
; GFX900-NEXT: v_readlane_b32 s4, v40, 2
; GFX900-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -2278,8 +2278,8 @@ define hidden void @func_call_clobber() #0 {
; GFX90A-V2A-DIS-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12
; GFX90A-V2A-DIS-NEXT: v_writelane_b32 v40, s31, 1
; GFX90A-V2A-DIS-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX90A-V2A-DIS-NEXT: v_readlane_b32 s31, v40, 1
; GFX90A-V2A-DIS-NEXT: v_readlane_b32 s30, v40, 0
+; GFX90A-V2A-DIS-NEXT: v_readlane_b32 s31, v40, 1
; GFX90A-V2A-DIS-NEXT: s_mov_b32 s32, s33
; GFX90A-V2A-DIS-NEXT: v_readlane_b32 s4, v40, 2
; GFX90A-V2A-DIS-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -2551,8 +2551,8 @@ define hidden void @func_call_clobber() #0 {
; GFX90A-V2A-EN-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12
; GFX90A-V2A-EN-NEXT: v_writelane_b32 v40, s31, 1
; GFX90A-V2A-EN-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX90A-V2A-EN-NEXT: v_readlane_b32 s31, v40, 1
; GFX90A-V2A-EN-NEXT: v_readlane_b32 s30, v40, 0
+; GFX90A-V2A-EN-NEXT: v_readlane_b32 s31, v40, 1
; GFX90A-V2A-EN-NEXT: s_mov_b32 s32, s33
; GFX90A-V2A-EN-NEXT: v_readlane_b32 s4, v40, 2
; GFX90A-V2A-EN-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -2793,8 +2793,8 @@ define hidden void @func_call_clobber() #0 {
; WAVE32-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12
; WAVE32-NEXT: v_writelane_b32 v40, s31, 1
; WAVE32-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; WAVE32-NEXT: v_readlane_b32 s31, v40, 1
; WAVE32-NEXT: v_readlane_b32 s30, v40, 0
+; WAVE32-NEXT: v_readlane_b32 s31, v40, 1
; WAVE32-NEXT: s_mov_b32 s32, s33
; WAVE32-NEXT: v_readlane_b32 s4, v40, 2
; WAVE32-NEXT: s_or_saveexec_b32 s5, -1
diff --git a/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll b/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll
index a0c25b2a0beb3..705d403764503 100644
--- a/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll
+++ b/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll
@@ -489,22 +489,20 @@ define weak_odr void @test(i32 %0) !dbg !34 {
; CHECK-NEXT: v_writelane_b32 v41, s16, 16
; CHECK-NEXT: .cfi_llvm_vector_registers 65, 2601, 16, 32
; CHECK-NEXT: .cfi_def_cfa_register 65
-; CHECK-NEXT: v_writelane_b32 v41, s30, 0
-; CHECK-NEXT: v_writelane_b32 v41, s31, 1
-; CHECK-NEXT: v_writelane_b32 v41, s34, 2
-; CHECK-NEXT: v_writelane_b32 v41, s35, 3
-; CHECK-NEXT: v_writelane_b32 v41, s36, 4
-; CHECK-NEXT: v_writelane_b32 v41, s37, 5
-; CHECK-NEXT: v_writelane_b32 v41, s38, 6
-; CHECK-NEXT: v_writelane_b32 v41, s39, 7
-; CHECK-NEXT: v_writelane_b32 v41, s48, 8
-; CHECK-NEXT: v_writelane_b32 v41, s49, 9
-; CHECK-NEXT: v_writelane_b32 v41, s50, 10
-; CHECK-NEXT: v_writelane_b32 v41, s51, 11
-; CHECK-NEXT: v_writelane_b32 v41, s52, 12
+; CHECK-NEXT: v_writelane_b32 v41, s34, 0
+; CHECK-NEXT: v_writelane_b32 v41, s35, 1
+; CHECK-NEXT: v_writelane_b32 v41, s36, 2
+; CHECK-NEXT: v_writelane_b32 v41, s37, 3
+; CHECK-NEXT: v_writelane_b32 v41, s38, 4
+; CHECK-NEXT: v_writelane_b32 v41, s39, 5
+; CHECK-NEXT: v_writelane_b32 v41, s48, 6
+; CHECK-NEXT: v_writelane_b32 v41, s49, 7
+; CHECK-NEXT: v_writelane_b32 v41, s50, 8
+; CHECK-NEXT: v_writelane_b32 v41, s51, 9
+; CHECK-NEXT: v_writelane_b32 v41, s52, 10
; CHECK-NEXT: s_addk_i32 s32, 0x400
-; CHECK-NEXT: v_writelane_b32 v41, s53, 13
-; CHECK-NEXT: v_writelane_b32 v41, s54, 14
+; CHECK-NEXT: v_writelane_b32 v41, s53, 11
+; CHECK-NEXT: v_writelane_b32 v41, s54, 12
; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5]
; CHECK-NEXT: ;DEBUG_VALUE: dummy:dummy <- undef
; CHECK-NEXT: .Ltmp0:
@@ -512,10 +510,12 @@ define weak_odr void @test(i32 %0) !dbg !34 {
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, __kmpc_alloc_shared@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, __kmpc_alloc_shared@gotpcrel32@hi+12
-; CHECK-NEXT: v_writelane_b32 v41, s55, 15
+; CHECK-NEXT: v_writelane_b32 v41, s55, 13
; CHECK-NEXT: s_load_dwordx2 s[54:55], s[4:5], 0x0
+; CHECK-NEXT: v_writelane_b32 v41, s30, 14
; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49]
; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; CHECK-NEXT: v_writelane_b32 v41, s31, 15
; CHECK-NEXT: v_mov_b32_e32 v40, v31
; CHECK-NEXT: s_mov_b32 s50, s15
; CHECK-NEXT: s_mov_b32 s51, s14
@@ -541,23 +541,23 @@ define weak_odr void @test(i32 %0) !dbg !34 {
; CHECK-NEXT: .loc 1 0 9 is_stmt 0 ; dummy:0:9
; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; CHECK-NEXT: v_mov_b32_e32 v2, 0
+; CHECK-NEXT: v_readlane_b32 s30, v41, 14
; CHECK-NEXT: flat_store_dword v[0:1], v2
-; CHECK-NEXT: v_readlane_b32 s55, v41, 15
-; CHECK-NEXT: v_readlane_b32 s54, v41, 14
-; CHECK-NEXT: v_readlane_b32 s53, v41, 13
-; CHECK-NEXT: v_readlane_b32 s52, v41, 12
-; CHECK-NEXT: v_readlane_b32 s51, v41, 11
-; CHECK-NEXT: v_readlane_b32 s50, v41, 10
-; CHECK-NEXT: v_readlane_b32 s49, v41, 9
-; CHECK-NEXT: v_readlane_b32 s48, v41, 8
-; CHECK-NEXT: v_readlane_b32 s39, v41, 7
-; CHECK-NEXT: v_readlane_b32 s38, v41, 6
-; CHECK-NEXT: v_readlane_b32 s37, v41, 5
-; CHECK-NEXT: v_readlane_b32 s36, v41, 4
-; CHECK-NEXT: v_readlane_b32 s35, v41, 3
-; CHECK-NEXT: v_readlane_b32 s34, v41, 2
-; CHECK-NEXT: v_readlane_b32 s31, v41, 1
-; CHECK-NEXT: v_readlane_b32 s30, v41, 0
+; CHECK-NEXT: v_readlane_b32 s31, v41, 15
+; CHECK-NEXT: v_readlane_b32 s55, v41, 13
+; CHECK-NEXT: v_readlane_b32 s54, v41, 12
+; CHECK-NEXT: v_readlane_b32 s53, v41, 11
+; CHECK-NEXT: v_readlane_b32 s52, v41, 10
+; CHECK-NEXT: v_readlane_b32 s51, v41, 9
+; CHECK-NEXT: v_readlane_b32 s50, v41, 8
+; CHECK-NEXT: v_readlane_b32 s49, v41, 7
+; CHECK-NEXT: v_readlane_b32 s48, v41, 6
+; CHECK-NEXT: v_readlane_b32 s39, v41, 5
+; CHECK-NEXT: v_readlane_b32 s38, v41, 4
+; CHECK-NEXT: v_readlane_b32 s37, v41, 3
+; CHECK-NEXT: v_readlane_b32 s36, v41, 2
+; CHECK-NEXT: v_readlane_b32 s35, v41, 1
+; CHECK-NEXT: v_readlane_b32 s34, v41, 0
; CHECK-NEXT: s_mov_b32 s32, s33
; CHECK-NEXT: v_readlane_b32 s4, v41, 16
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll b/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll
index bcccf50e3805c..db48100aa7caf 100644
--- a/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll
+++ b/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll
@@ -299,8 +299,8 @@ define amdgpu_gfx void @amdgpu_gfx() #0 {
; CHECK-TRUE16-NEXT: s_wait_alu 0xfffe
; CHECK-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
; CHECK-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; CHECK-TRUE16-NEXT: v_readlane_b32 s31, v40, 1
; CHECK-TRUE16-NEXT: v_readlane_b32 s30, v40, 0
+; CHECK-TRUE16-NEXT: v_readlane_b32 s31, v40, 1
; CHECK-TRUE16-NEXT: s_mov_b32 s32, s33
; CHECK-TRUE16-NEXT: v_readlane_b32 s0, v40, 2
; CHECK-TRUE16-NEXT: s_or_saveexec_b32 s1, -1
@@ -339,8 +339,8 @@ define amdgpu_gfx void @amdgpu_gfx() #0 {
; CHECK-FAKE16-NEXT: s_wait_alu 0xfffe
; CHECK-FAKE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
; CHECK-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; CHECK-FAKE16-NEXT: v_readlane_b32 s31, v40, 1
; CHECK-FAKE16-NEXT: v_readlane_b32 s30, v40, 0
+; CHECK-FAKE16-NEXT: v_readlane_b32 s31, v40, 1
; CHECK-FAKE16-NEXT: s_mov_b32 s32, s33
; CHECK-FAKE16-NEXT: v_readlane_b32 s0, v40, 2
; CHECK-FAKE16-NEXT: s_or_saveexec_b32 s1, -1
diff --git a/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll b/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll
index 76a2114a000cf..cba5aa8ef3672 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll
+++ b/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll
@@ -55,8 +55,8 @@ define void @test_stack_realign(<8 x i32> %val, i32 %idx) #0 {
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GCN-NEXT: v_readlane_b32 s31, v42, 1
; GCN-NEXT: v_readlane_b32 s30, v42, 0
+; GCN-NEXT: v_readlane_b32 s31, v42, 1
; GCN-NEXT: s_mov_b32 s32, s34
; GCN-NEXT: v_readlane_b32 s4, v42, 2
; GCN-NEXT: v_readlane_b32 s34, v42, 3
diff --git a/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll
index 2e88da142bb41..6abe5998d6767 100644
--- a/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll
+++ b/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll
@@ -26,8 +26,8 @@ define void @callee_with_stack_and_call() #0 {
; SPILL-TO-VGPR-NEXT: buffer_store_dword v0, off, s[0:3], s33
; SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0)
; SPILL-TO-VGPR-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; SPILL-TO-VGPR-NEXT: v_readlane_b32 s31, v40, 1
; SPILL-TO-VGPR-NEXT: v_readlane_b32 s30, v40, 0
+; SPILL-TO-VGPR-NEXT: v_readlane_b32 s31, v40, 1
; SPILL-TO-VGPR-NEXT: s_mov_b32 s32, s33
; SPILL-TO-VGPR-NEXT: v_readlane_b32 s4, v40, 2
; SPILL-TO-VGPR-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -46,21 +46,14 @@ define void @callee_with_stack_and_call() #0 {
; NO-SPILL-TO-VGPR-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; NO-SPILL-TO-VGPR-NEXT: s_addk_i32 s32, 0x800
; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 s[4:5], exec
-; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, 1
+; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, 3
; NO-SPILL-TO-VGPR-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:16
; NO-SPILL-TO-VGPR-NEXT: v_writelane_b32 v0, s30, 0
+; NO-SPILL-TO-VGPR-NEXT: v_writelane_b32 v0, s31, 1
; NO-SPILL-TO-VGPR-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; NO-SPILL-TO-VGPR-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:16
; NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0)
; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[4:5]
-; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 s[4:5], exec
-; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, 1
-; NO-SPILL-TO-VGPR-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:16
-; NO-SPILL-TO-VGPR-NEXT: v_writelane_b32 v0, s31, 0
-; NO-SPILL-TO-VGPR-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; NO-SPILL-TO-VGPR-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:16
-; NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0)
-; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[4:5]
; NO-SPILL-TO-VGPR-NEXT: v_mov_b32_e32 v0, 0
; NO-SPILL-TO-VGPR-NEXT: s_getpc_b64 s[4:5]
; NO-SPILL-TO-VGPR-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
@@ -69,20 +62,12 @@ define void @callee_with_stack_and_call() #0 {
; NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0)
; NO-SPILL-TO-VGPR-NEXT: s_swappc_b64 s[30:31], s[4:5]
; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 s[4:5], exec
-; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, 1
-; NO-SPILL-TO-VGPR-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:16
-; NO-SPILL-TO-VGPR-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0)
-; NO-SPILL-TO-VGPR-NEXT: v_readlane_b32 s31, v0, 0
-; NO-SPILL-TO-VGPR-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:16
-; NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0)
-; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[4:5]
-; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 s[4:5], exec
-; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, 1
+; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, 3
; NO-SPILL-TO-VGPR-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:16
; NO-SPILL-TO-VGPR-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0)
; NO-SPILL-TO-VGPR-NEXT: v_readlane_b32 s30, v0, 0
+; NO-SPILL-TO-VGPR-NEXT: v_readlane_b32 s31, v0, 1
; NO-SPILL-TO-VGPR-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:16
; NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0)
; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll b/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll
index 0f535c3a3bdbc..6dd683e6fca53 100644
--- a/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll
@@ -1740,8 +1740,8 @@ define void @caller_void_func_i32_v2float_inreg(i32 inreg %arg0, <2 x float> inr
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[20:21]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s4, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -1770,8 +1770,8 @@ define void @caller_void_func_i32_v2float_inreg(i32 inreg %arg0, <2 x float> inr
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -2159,8 +2159,8 @@ define void @void_func_a13i32_inreg([13 x i32] inreg %arg0, ptr addrspace(1) %p
; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s4, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -2201,8 +2201,8 @@ define void @void_func_a13i32_inreg([13 x i32] inreg %arg0, ptr addrspace(1) %p
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll b/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll
index 9d137fb4101e4..a2f203a111e18 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll
@@ -39,47 +39,47 @@ define amdgpu_gfx void @gfx_func() {
; SDAG-NEXT: v_writelane_b32 v40, s27, 23
; SDAG-NEXT: v_writelane_b32 v40, s28, 24
; SDAG-NEXT: v_writelane_b32 v40, s29, 25
-; SDAG-NEXT: v_writelane_b32 v40, s30, 26
-; SDAG-NEXT: v_writelane_b32 v40, s31, 27
-; SDAG-NEXT: v_writelane_b32 v40, s72, 28
-; SDAG-NEXT: v_writelane_b32 v40, s73, 29
-; SDAG-NEXT: v_writelane_b32 v40, s74, 30
-; SDAG-NEXT: v_writelane_b32 v40, s75, 31
-; SDAG-NEXT: v_writelane_b32 v40, s76, 32
-; SDAG-NEXT: v_writelane_b32 v40, s77, 33
-; SDAG-NEXT: v_writelane_b32 v40, s78, 34
-; SDAG-NEXT: v_writelane_b32 v40, s79, 35
-; SDAG-NEXT: v_writelane_b32 v40, s88, 36
-; SDAG-NEXT: v_writelane_b32 v40, s89, 37
-; SDAG-NEXT: v_writelane_b32 v40, s90, 38
-; SDAG-NEXT: v_writelane_b32 v40, s91, 39
-; SDAG-NEXT: v_writelane_b32 v40, s92, 40
-; SDAG-NEXT: v_writelane_b32 v40, s93, 41
-; SDAG-NEXT: v_writelane_b32 v40, s94, 42
+; SDAG-NEXT: v_writelane_b32 v40, s72, 26
+; SDAG-NEXT: v_writelane_b32 v40, s73, 27
+; SDAG-NEXT: v_writelane_b32 v40, s74, 28
+; SDAG-NEXT: v_writelane_b32 v40, s75, 29
+; SDAG-NEXT: v_writelane_b32 v40, s76, 30
+; SDAG-NEXT: v_writelane_b32 v40, s77, 31
+; SDAG-NEXT: v_writelane_b32 v40, s78, 32
+; SDAG-NEXT: v_writelane_b32 v40, s79, 33
+; SDAG-NEXT: v_writelane_b32 v40, s88, 34
+; SDAG-NEXT: v_writelane_b32 v40, s89, 35
+; SDAG-NEXT: v_writelane_b32 v40, s90, 36
+; SDAG-NEXT: v_writelane_b32 v40, s91, 37
+; SDAG-NEXT: v_writelane_b32 v40, s92, 38
+; SDAG-NEXT: v_writelane_b32 v40, s93, 39
+; SDAG-NEXT: v_writelane_b32 v40, s94, 40
+; SDAG-NEXT: v_writelane_b32 v40, s95, 41
+; SDAG-NEXT: v_writelane_b32 v40, s30, 42
; SDAG-NEXT: s_mov_b32 s35, extern_c_func@abs32@hi
; SDAG-NEXT: s_mov_b32 s34, extern_c_func@abs32@lo
; SDAG-NEXT: s_mov_b64 s[8:9], 0
; SDAG-NEXT: s_addk_i32 s32, 0x400
-; SDAG-NEXT: v_writelane_b32 v40, s95, 43
+; SDAG-NEXT: v_writelane_b32 v40, s31, 43
; SDAG-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; SDAG-NEXT: v_readlane_b32 s95, v40, 43
-; SDAG-NEXT: v_readlane_b32 s94, v40, 42
-; SDAG-NEXT: v_readlane_b32 s93, v40, 41
-; SDAG-NEXT: v_readlane_b32 s92, v40, 40
-; SDAG-NEXT: v_readlane_b32 s91, v40, 39
-; SDAG-NEXT: v_readlane_b32 s90, v40, 38
-; SDAG-NEXT: v_readlane_b32 s89, v40, 37
-; SDAG-NEXT: v_readlane_b32 s88, v40, 36
-; SDAG-NEXT: v_readlane_b32 s79, v40, 35
-; SDAG-NEXT: v_readlane_b32 s78, v40, 34
-; SDAG-NEXT: v_readlane_b32 s77, v40, 33
-; SDAG-NEXT: v_readlane_b32 s76, v40, 32
-; SDAG-NEXT: v_readlane_b32 s75, v40, 31
-; SDAG-NEXT: v_readlane_b32 s74, v40, 30
-; SDAG-NEXT: v_readlane_b32 s73, v40, 29
-; SDAG-NEXT: v_readlane_b32 s72, v40, 28
-; SDAG-NEXT: v_readlane_b32 s31, v40, 27
-; SDAG-NEXT: v_readlane_b32 s30, v40, 26
+; SDAG-NEXT: v_readlane_b32 s30, v40, 42
+; SDAG-NEXT: v_readlane_b32 s31, v40, 43
+; SDAG-NEXT: v_readlane_b32 s95, v40, 41
+; SDAG-NEXT: v_readlane_b32 s94, v40, 40
+; SDAG-NEXT: v_readlane_b32 s93, v40, 39
+; SDAG-NEXT: v_readlane_b32 s92, v40, 38
+; SDAG-NEXT: v_readlane_b32 s91, v40, 37
+; SDAG-NEXT: v_readlane_b32 s90, v40, 36
+; SDAG-NEXT: v_readlane_b32 s89, v40, 35
+; SDAG-NEXT: v_readlane_b32 s88, v40, 34
+; SDAG-NEXT: v_readlane_b32 s79, v40, 33
+; SDAG-NEXT: v_readlane_b32 s78, v40, 32
+; SDAG-NEXT: v_readlane_b32 s77, v40, 31
+; SDAG-NEXT: v_readlane_b32 s76, v40, 30
+; SDAG-NEXT: v_readlane_b32 s75, v40, 29
+; SDAG-NEXT: v_readlane_b32 s74, v40, 28
+; SDAG-NEXT: v_readlane_b32 s73, v40, 27
+; SDAG-NEXT: v_readlane_b32 s72, v40, 26
; SDAG-NEXT: v_readlane_b32 s29, v40, 25
; SDAG-NEXT: v_readlane_b32 s28, v40, 24
; SDAG-NEXT: v_readlane_b32 s27, v40, 23
@@ -148,47 +148,47 @@ define amdgpu_gfx void @gfx_func() {
; GISEL-NEXT: v_writelane_b32 v40, s27, 23
; GISEL-NEXT: v_writelane_b32 v40, s28, 24
; GISEL-NEXT: v_writelane_b32 v40, s29, 25
-; GISEL-NEXT: v_writelane_b32 v40, s30, 26
-; GISEL-NEXT: v_writelane_b32 v40, s31, 27
-; GISEL-NEXT: v_writelane_b32 v40, s72, 28
-; GISEL-NEXT: v_writelane_b32 v40, s73, 29
-; GISEL-NEXT: v_writelane_b32 v40, s74, 30
-; GISEL-NEXT: v_writelane_b32 v40, s75, 31
-; GISEL-NEXT: v_writelane_b32 v40, s76, 32
-; GISEL-NEXT: v_writelane_b32 v40, s77, 33
-; GISEL-NEXT: v_writelane_b32 v40, s78, 34
-; GISEL-NEXT: v_writelane_b32 v40, s79, 35
-; GISEL-NEXT: v_writelane_b32 v40, s88, 36
-; GISEL-NEXT: v_writelane_b32 v40, s89, 37
-; GISEL-NEXT: v_writelane_b32 v40, s90, 38
-; GISEL-NEXT: v_writelane_b32 v40, s91, 39
-; GISEL-NEXT: v_writelane_b32 v40, s92, 40
-; GISEL-NEXT: v_writelane_b32 v40, s93, 41
-; GISEL-NEXT: v_writelane_b32 v40, s94, 42
+; GISEL-NEXT: v_writelane_b32 v40, s72, 26
+; GISEL-NEXT: v_writelane_b32 v40, s73, 27
+; GISEL-NEXT: v_writelane_b32 v40, s74, 28
+; GISEL-NEXT: v_writelane_b32 v40, s75, 29
+; GISEL-NEXT: v_writelane_b32 v40, s76, 30
+; GISEL-NEXT: v_writelane_b32 v40, s77, 31
+; GISEL-NEXT: v_writelane_b32 v40, s78, 32
+; GISEL-NEXT: v_writelane_b32 v40, s79, 33
+; GISEL-NEXT: v_writelane_b32 v40, s88, 34
+; GISEL-NEXT: v_writelane_b32 v40, s89, 35
+; GISEL-NEXT: v_writelane_b32 v40, s90, 36
+; GISEL-NEXT: v_writelane_b32 v40, s91, 37
+; GISEL-NEXT: v_writelane_b32 v40, s92, 38
+; GISEL-NEXT: v_writelane_b32 v40, s93, 39
+; GISEL-NEXT: v_writelane_b32 v40, s94, 40
+; GISEL-NEXT: v_writelane_b32 v40, s95, 41
+; GISEL-NEXT: v_writelane_b32 v40, s30, 42
; GISEL-NEXT: s_mov_b32 s34, extern_c_func@abs32@lo
; GISEL-NEXT: s_mov_b32 s35, extern_c_func@abs32@hi
; GISEL-NEXT: s_mov_b64 s[8:9], 0
; GISEL-NEXT: s_addk_i32 s32, 0x400
-; GISEL-NEXT: v_writelane_b32 v40, s95, 43
+; GISEL-NEXT: v_writelane_b32 v40, s31, 43
; GISEL-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GISEL-NEXT: v_readlane_b32 s95, v40, 43
-; GISEL-NEXT: v_readlane_b32 s94, v40, 42
-; GISEL-NEXT: v_readlane_b32 s93, v40, 41
-; GISEL-NEXT: v_readlane_b32 s92, v40, 40
-; GISEL-NEXT: v_readlane_b32 s91, v40, 39
-; GISEL-NEXT: v_readlane_b32 s90, v40, 38
-; GISEL-NEXT: v_readlane_b32 s89, v40, 37
-; GISEL-NEXT: v_readlane_b32 s88, v40, 36
-; GISEL-NEXT: v_readlane_b32 s79, v40, 35
-; GISEL-NEXT: v_readlane_b32 s78, v40, 34
-; GISEL-NEXT: v_readlane_b32 s77, v40, 33
-; GISEL-NEXT: v_readlane_b32 s76, v40, 32
-; GISEL-NEXT: v_readlane_b32 s75, v40, 31
-; GISEL-NEXT: v_readlane_b32 s74, v40, 30
-; GISEL-NEXT: v_readlane_b32 s73, v40, 29
-; GISEL-NEXT: v_readlane_b32 s72, v40, 28
-; GISEL-NEXT: v_readlane_b32 s31, v40, 27
-; GISEL-NEXT: v_readlane_b32 s30, v40, 26
+; GISEL-NEXT: v_readlane_b32 s30, v40, 42
+; GISEL-NEXT: v_readlane_b32 s31, v40, 43
+; GISEL-NEXT: v_readlane_b32 s95, v40, 41
+; GISEL-NEXT: v_readlane_b32 s94, v40, 40
+; GISEL-NEXT: v_readlane_b32 s93, v40, 39
+; GISEL-NEXT: v_readlane_b32 s92, v40, 38
+; GISEL-NEXT: v_readlane_b32 s91, v40, 37
+; GISEL-NEXT: v_readlane_b32 s90, v40, 36
+; GISEL-NEXT: v_readlane_b32 s89, v40, 35
+; GISEL-NEXT: v_readlane_b32 s88, v40, 34
+; GISEL-NEXT: v_readlane_b32 s79, v40, 33
+; GISEL-NEXT: v_readlane_b32 s78, v40, 32
+; GISEL-NEXT: v_readlane_b32 s77, v40, 31
+; GISEL-NEXT: v_readlane_b32 s76, v40, 30
+; GISEL-NEXT: v_readlane_b32 s75, v40, 29
+; GISEL-NEXT: v_readlane_b32 s74, v40, 28
+; GISEL-NEXT: v_readlane_b32 s73, v40, 27
+; GISEL-NEXT: v_readlane_b32 s72, v40, 26
; GISEL-NEXT: v_readlane_b32 s29, v40, 25
; GISEL-NEXT: v_readlane_b32 s28, v40, 24
; GISEL-NEXT: v_readlane_b32 s27, v40, 23
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
index 234eaa8af7edf..39231558934a7 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
@@ -141,8 +141,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm() #0 {
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s32
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -170,8 +170,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm() #0 {
; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -200,8 +200,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -229,8 +229,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm() #0 {
; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s32
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -264,8 +264,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 {
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s32
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -295,8 +295,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 {
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -326,8 +326,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 {
; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
; GFX11-NEXT: scratch_store_b8 off, v0, s32
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -357,8 +357,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 {
; GFX10-SCRATCH-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s32
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -393,8 +393,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 {
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s32
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -424,8 +424,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 {
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -455,8 +455,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 {
; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
; GFX11-NEXT: scratch_store_b8 off, v0, s32
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -486,8 +486,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 {
; GFX10-SCRATCH-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s32
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -519,8 +519,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -547,8 +547,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -576,8 +576,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 {
; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33
; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1
@@ -604,8 +604,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 {
; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-FAKE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-FAKE16-NEXT: s_mov_b32 s32, s33
; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s1, -1
@@ -632,8 +632,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -665,8 +665,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -694,8 +694,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -724,8 +724,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 {
; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33
; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1
@@ -753,8 +753,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 {
; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-FAKE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-FAKE16-NEXT: s_mov_b32 s32, s33
; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s1, -1
@@ -782,8 +782,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -816,8 +816,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -845,8 +845,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -875,8 +875,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 {
; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33
; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1
@@ -904,8 +904,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 {
; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-FAKE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-FAKE16-NEXT: s_mov_b32 s32, s33
; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s1, -1
@@ -933,8 +933,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -966,8 +966,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -994,8 +994,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -1023,8 +1023,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 {
; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33
; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1
@@ -1051,8 +1051,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 {
; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-FAKE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-FAKE16-NEXT: s_mov_b32 s32, s33
; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s1, -1
@@ -1079,8 +1079,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -1112,8 +1112,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -1141,8 +1141,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -1171,8 +1171,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 {
; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33
; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1
@@ -1200,8 +1200,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 {
; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-FAKE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-FAKE16-NEXT: s_mov_b32 s32, s33
; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s1, -1
@@ -1229,8 +1229,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -1263,8 +1263,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -1292,8 +1292,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -1322,8 +1322,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 {
; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33
; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1
@@ -1351,8 +1351,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 {
; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-FAKE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-FAKE16-NEXT: s_mov_b32 s32, s33
; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s1, -1
@@ -1380,8 +1380,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -1413,8 +1413,8 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm(i32) #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -1441,8 +1441,8 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm(i32) #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -1470,8 +1470,8 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm(i32) #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -1498,8 +1498,8 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm(i32) #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -1531,8 +1531,8 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -1560,8 +1560,8 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm() #0 {
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -1589,8 +1589,8 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -1618,8 +1618,8 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm() #0 {
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -1652,8 +1652,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -1682,8 +1682,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -1713,8 +1713,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -1743,8 +1743,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -1779,8 +1779,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -1810,8 +1810,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm() #0 {
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -1840,8 +1840,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -1871,8 +1871,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm() #0 {
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -1907,8 +1907,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -1939,8 +1939,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 {
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -1970,8 +1970,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -2002,8 +2002,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 {
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -2043,8 +2043,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -2077,8 +2077,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 {
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -2109,8 +2109,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -2143,8 +2143,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 {
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -2177,8 +2177,8 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -2205,8 +2205,8 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -2234,8 +2234,8 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 {
; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33
; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1
@@ -2262,8 +2262,8 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 {
; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-FAKE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-FAKE16-NEXT: s_mov_b32 s32, s33
; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s1, -1
@@ -2290,8 +2290,8 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -2322,8 +2322,8 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -2350,8 +2350,8 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -2379,8 +2379,8 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -2407,8 +2407,8 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -2440,8 +2440,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -2469,8 +2469,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm() #0 {
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -2498,8 +2498,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -2527,8 +2527,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm() #0 {
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -2561,8 +2561,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -2591,8 +2591,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm() #0 {
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -2621,8 +2621,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -2651,8 +2651,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm() #0 {
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -2687,8 +2687,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -2719,8 +2719,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 {
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -2750,8 +2750,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -2782,8 +2782,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 {
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -2815,8 +2815,8 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -2844,8 +2844,8 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm() #0 {
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -2873,8 +2873,8 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -2902,8 +2902,8 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm() #0 {
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -2937,8 +2937,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -2968,8 +2968,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm() #0 {
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -2998,8 +2998,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -3029,8 +3029,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm() #0 {
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -3066,8 +3066,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -3099,8 +3099,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 {
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -3130,8 +3130,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -3163,8 +3163,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 {
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -3200,8 +3200,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8() #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -3233,8 +3233,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8() #0 {
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -3267,8 +3267,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8() #0 {
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33
; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1
@@ -3300,8 +3300,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8() #0 {
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
; GFX11-FAKE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-FAKE16-NEXT: s_mov_b32 s32, s33
; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s1, -1
@@ -3333,8 +3333,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT: v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -3371,8 +3371,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8() #0 {
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -3404,8 +3404,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8() #0 {
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -3437,8 +3437,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8() #0 {
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v0
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -3470,8 +3470,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8() #0 {
; GFX10-SCRATCH-NEXT: v_lshrrev_b32_e32 v1, 8, v0
; GFX10-SCRATCH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -3509,8 +3509,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8() #0 {
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -3543,8 +3543,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8() #0 {
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -3577,8 +3577,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8() #0 {
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX11-NEXT: v_lshrrev_b32_e32 v3, 24, v0
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -3611,8 +3611,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8() #0 {
; GFX10-SCRATCH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX10-SCRATCH-NEXT: v_lshrrev_b32_e32 v3, 24, v0
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -3652,8 +3652,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8() #0 {
; GFX9-NEXT: v_mov_b32_e32 v0, v5
; GFX9-NEXT: v_mov_b32_e32 v4, v6
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -3688,8 +3688,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8() #0 {
; GFX10-NEXT: v_mov_b32_e32 v0, v5
; GFX10-NEXT: v_mov_b32_e32 v4, v6
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -3724,8 +3724,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8() #0 {
; GFX11-NEXT: v_mov_b32_e32 v0, v5
; GFX11-NEXT: v_mov_b32_e32 v4, v6
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -3760,8 +3760,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8() #0 {
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, v5
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, v6
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -3804,8 +3804,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8() #0 {
; GFX9-NEXT: v_mov_b32_e32 v4, v1
; GFX9-NEXT: v_mov_b32_e32 v1, v8
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -3843,8 +3843,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8() #0 {
; GFX10-NEXT: v_mov_b32_e32 v4, v1
; GFX10-NEXT: v_mov_b32_e32 v1, v8
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -3881,8 +3881,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8() #0 {
; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v1
; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v1, v8
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -3920,8 +3920,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8() #0 {
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, v1
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, v8
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -3996,8 +3996,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8() #0 {
; GFX9-NEXT: v_mov_b32_e32 v18, v33
; GFX9-NEXT: v_mov_b32_e32 v19, v34
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -4068,8 +4068,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8() #0 {
; GFX10-NEXT: v_mov_b32_e32 v18, v33
; GFX10-NEXT: v_mov_b32_e32 v19, v34
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -4135,8 +4135,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8() #0 {
; GFX11-NEXT: v_dual_mov_b32 v17, v32 :: v_dual_mov_b32 v18, v33
; GFX11-NEXT: v_mov_b32_e32 v19, v34
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -4207,8 +4207,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8() #0 {
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v18, v33
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v19, v34
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -4249,8 +4249,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_ret() #0 {
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT: v_readlane_b32 s31, v42, 1
; GFX9-NEXT: v_readlane_b32 s30, v42, 0
+; GFX9-NEXT: v_readlane_b32 s31, v42, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v42, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -4285,8 +4285,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_ret() #0 {
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4
-; GFX10-NEXT: v_readlane_b32 s31, v42, 1
; GFX10-NEXT: v_readlane_b32 s30, v42, 0
+; GFX10-NEXT: v_readlane_b32 s31, v42, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v42, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -4322,8 +4322,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_ret() #0 {
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s33
; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s33 offset:4
-; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v42, 1
; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v42, 0
+; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v42, 1
; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33
; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v42, 2
; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1
@@ -4358,8 +4358,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_ret() #0 {
; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s33
; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s33 offset:4
-; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v42, 1
; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v42, 0
+; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v42, 1
; GFX11-FAKE16-NEXT: s_mov_b32 s32, s33
; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v42, 2
; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s1, -1
@@ -4394,8 +4394,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_ret() #0 {
; GFX10-SCRATCH-NEXT: s_clause 0x1
; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33
; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:4
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v42, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v42, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -4442,8 +4442,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 {
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT: v_readlane_b32 s31, v42, 1
; GFX9-NEXT: v_readlane_b32 s30, v42, 0
+; GFX9-NEXT: v_readlane_b32 s31, v42, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v42, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -4478,8 +4478,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 {
; GFX10-NEXT: v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1
-; GFX10-NEXT: v_readlane_b32 s31, v42, 1
; GFX10-NEXT: v_readlane_b32 s30, v42, 0
+; GFX10-NEXT: v_readlane_b32 s31, v42, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v42, 2
; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -4522,8 +4522,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 {
; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v42, 1
; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v42, 0
+; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v42, 1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h
; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33
@@ -4566,8 +4566,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 {
; GFX11-FAKE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v42, 1
; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v42, 0
+; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v42, 1
; GFX11-FAKE16-NEXT: s_mov_b32 s32, s33
; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v42, 2
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1
@@ -4607,8 +4607,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 {
; GFX10-SCRATCH-NEXT: v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_lshlrev_b16 v1, 8, v1
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v42, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v42, 2
; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -4663,8 +4663,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 {
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT: v_readlane_b32 s31, v42, 1
; GFX9-NEXT: v_readlane_b32 s30, v42, 0
+; GFX9-NEXT: v_readlane_b32 s31, v42, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v42, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -4701,8 +4701,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 {
; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1
; GFX10-NEXT: v_mov_b32_e32 v3, 2
; GFX10-NEXT: v_mov_b32_e32 v4, 0
-; GFX10-NEXT: v_readlane_b32 s31, v42, 1
; GFX10-NEXT: v_readlane_b32 s30, v42, 0
+; GFX10-NEXT: v_readlane_b32 s31, v42, 1
; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v42, 2
@@ -4743,20 +4743,20 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 {
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v0
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, 2
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, 0
+; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v42, 0
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h
-; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v42, 1
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: global_store_b8 v[3:4], v2, off
; GFX11-TRUE16-NEXT: global_store_b16 v[40:41], v0, off
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s33
; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s33 offset:4
-; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v42, 0
+; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v42, 1
; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33
; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v42, 2
; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1
@@ -4795,8 +4795,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 {
; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v0
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 2
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v42, 1
; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v42, 0
+; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v42, 1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v4, v3
; GFX11-FAKE16-NEXT: s_mov_b32 s32, s33
; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v42, 2
@@ -4840,8 +4840,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 {
; GFX10-SCRATCH-NEXT: v_lshlrev_b16 v1, 8, v1
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 2
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 0
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v42, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1
; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v42, 2
@@ -4898,8 +4898,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 {
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT: v_readlane_b32 s31, v42, 1
; GFX9-NEXT: v_readlane_b32 s30, v42, 0
+; GFX9-NEXT: v_readlane_b32 s31, v42, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v42, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -4936,8 +4936,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 {
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1
; GFX10-NEXT: v_lshlrev_b16 v3, 8, v3
-; GFX10-NEXT: v_readlane_b32 s31, v42, 1
; GFX10-NEXT: v_readlane_b32 s30, v42, 0
+; GFX10-NEXT: v_readlane_b32 s31, v42, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -4985,11 +4985,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 {
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v42, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v42, 1
-; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v42, 0
; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33
; GFX11-TRUE16-NEXT: global_store_b32 v[40:41], v0, off
; GFX11-TRUE16-NEXT: s_clause 0x1
@@ -5033,8 +5033,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 {
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v3, 8, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v42, 1
; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v42, 0
+; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v42, 1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-FAKE16-NEXT: s_mov_b32 s32, s33
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v2, v3
@@ -5082,8 +5082,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 {
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_lshlrev_b16 v1, 8, v1
; GFX10-SCRATCH-NEXT: v_lshlrev_b16 v3, 8, v3
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v42, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -5146,8 +5146,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 {
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT: v_readlane_b32 s31, v42, 1
; GFX9-NEXT: v_readlane_b32 s30, v42, 0
+; GFX9-NEXT: v_readlane_b32 s31, v42, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v42, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -5186,8 +5186,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 {
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1
; GFX10-NEXT: v_lshlrev_b16 v3, 8, v3
-; GFX10-NEXT: v_readlane_b32 s31, v42, 1
; GFX10-NEXT: v_readlane_b32 s30, v42, 0
+; GFX10-NEXT: v_readlane_b32 s31, v42, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -5242,7 +5242,7 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 {
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 4
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, 0
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v42, 0
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: global_store_b8 v[2:3], v4, off
@@ -5251,7 +5251,6 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 {
; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s33
; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s33 offset:4
; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v42, 1
-; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v42, 0
; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33
; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v42, 2
; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1
@@ -5293,8 +5292,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 {
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v3, 8, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v42, 1
; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v42, 0
+; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v42, 1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-FAKE16-NEXT: s_mov_b32 s32, s33
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v2, v3
@@ -5347,8 +5346,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 {
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_lshlrev_b16 v1, 8, v1
; GFX10-SCRATCH-NEXT: v_lshlrev_b16 v3, 8, v3
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v42, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -5419,8 +5418,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 {
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT: v_readlane_b32 s31, v42, 1
; GFX9-NEXT: v_readlane_b32 s30, v42, 0
+; GFX9-NEXT: v_readlane_b32 s31, v42, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v42, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -5464,12 +5463,12 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 {
; GFX10-NEXT: v_lshlrev_b16 v7, 8, v7
; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1
; GFX10-NEXT: v_lshlrev_b16 v3, 8, v3
-; GFX10-NEXT: v_readlane_b32 s31, v42, 1
+; GFX10-NEXT: v_readlane_b32 s30, v42, 0
; GFX10-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT: v_or_b32_sdwa v5, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-NEXT: v_readlane_b32 s30, v42, 0
+; GFX10-NEXT: v_readlane_b32 s31, v42, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_or_b32_sdwa v1, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-NEXT: v_readlane_b32 s34, v42, 2
@@ -5527,14 +5526,13 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 {
; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.h, v0.h
; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v3.h, v2.h
; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v2.l, v3.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v42, 0
; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v1.l
; GFX11-TRUE16-NEXT: global_store_b64 v[40:41], v[3:4], off
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s33
; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s33 offset:4
; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v42, 1
-; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v42, 0
; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33
; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v42, 2
; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1
@@ -5585,12 +5583,12 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 {
; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v5
; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v6, v7
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v42, 1
+; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v42, 0
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v2, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v4
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v5
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v42, 0
+; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v42, 1
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v1
; GFX11-FAKE16-NEXT: s_mov_b32 s32, s33
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v2, v3
@@ -5642,12 +5640,12 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 {
; GFX10-SCRATCH-NEXT: v_lshlrev_b16 v7, 8, v7
; GFX10-SCRATCH-NEXT: v_lshlrev_b16 v1, 8, v1
; GFX10-SCRATCH-NEXT: v_lshlrev_b16 v3, 8, v3
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v42, 0
; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v5, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v42, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v1, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v42, 2
@@ -5781,8 +5779,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 {
; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
-; GFX9-NEXT: v_readlane_b32 s31, v44, 1
; GFX9-NEXT: v_readlane_b32 s30, v44, 0
+; GFX9-NEXT: v_readlane_b32 s31, v44, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v44, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -5904,8 +5902,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 {
; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4
; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12
-; GFX10-NEXT: v_readlane_b32 s31, v44, 1
; GFX10-NEXT: v_readlane_b32 s30, v44, 0
+; GFX10-NEXT: v_readlane_b32 s31, v44, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v44, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -6032,8 +6030,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 {
; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s33 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s33 offset:8
; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s33 offset:12
-; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v44, 1
; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v44, 0
+; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v44, 1
; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33
; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v44, 2
; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1
@@ -6183,8 +6181,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 {
; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s33 offset:4
; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s33 offset:8
; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s33 offset:12
-; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v44, 1
; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v44, 0
+; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v44, 1
; GFX11-FAKE16-NEXT: s_mov_b32 s32, s33
; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v44, 2
; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s1, -1
@@ -6306,8 +6304,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 {
; GFX10-SCRATCH-NEXT: scratch_load_dword v42, off, s33 offset:4
; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:8
; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:12
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v44, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v44, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v44, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v44, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -6342,8 +6340,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -6370,8 +6368,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -6399,8 +6397,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -6427,8 +6425,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -6460,8 +6458,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -6488,8 +6486,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -6517,8 +6515,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -6545,8 +6543,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -6578,8 +6576,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -6606,8 +6604,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -6635,8 +6633,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -6663,8 +6661,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -6697,8 +6695,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -6726,8 +6724,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm() #0 {
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -6755,8 +6753,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -6784,8 +6782,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm() #0 {
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -6817,8 +6815,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -6846,8 +6844,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm() #0 {
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -6876,8 +6874,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -6905,8 +6903,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm() #0 {
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -6937,8 +6935,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -6965,8 +6963,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -6994,8 +6992,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -7022,8 +7020,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -7056,8 +7054,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -7085,8 +7083,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm() #0 {
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -7115,8 +7113,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -7144,8 +7142,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm() #0 {
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -7176,8 +7174,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -7204,8 +7202,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -7233,8 +7231,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -7261,8 +7259,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -7294,8 +7292,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -7322,8 +7320,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -7351,8 +7349,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -7379,8 +7377,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -7413,8 +7411,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -7442,8 +7440,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm() #0 {
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -7471,8 +7469,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -7500,8 +7498,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm() #0 {
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -7534,8 +7532,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm(i32) #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -7564,8 +7562,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm(i32) #0 {
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -7594,8 +7592,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm(i32) #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -7624,8 +7622,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm(i32) #0 {
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -7659,8 +7657,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32(i32) #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -7690,8 +7688,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32(i32) #0 {
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -7720,8 +7718,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32(i32) #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -7751,8 +7749,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32(i32) #0 {
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -7783,8 +7781,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -7811,8 +7809,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -7840,8 +7838,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -7868,8 +7866,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -7904,8 +7902,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -7935,8 +7933,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm() #0 {
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -7965,8 +7963,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -7996,8 +7994,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm() #0 {
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -8032,8 +8030,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -8064,8 +8062,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 {
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -8095,8 +8093,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -8127,8 +8125,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 {
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -8163,8 +8161,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 {
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v8i32@abs32@lo
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -8196,8 +8194,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 {
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v8i32@abs32@lo
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -8230,8 +8228,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -8263,8 +8261,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 {
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v8i32@abs32@lo
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -8304,8 +8302,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -8339,8 +8337,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 {
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -8371,8 +8369,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -8406,8 +8404,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 {
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -8444,8 +8442,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 {
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v16i32@abs32@lo
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -8479,8 +8477,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 {
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v16i32@abs32@lo
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -8515,8 +8513,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -8550,8 +8548,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 {
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v16i32@abs32@lo
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -8595,8 +8593,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 {
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v32i32@abs32@lo
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -8634,8 +8632,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 {
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v32i32@abs32@lo
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -8674,8 +8672,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -8713,8 +8711,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 {
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v32i32@abs32@lo
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -8761,8 +8759,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 {
; GFX9-NEXT: s_waitcnt vmcnt(8)
; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -8803,8 +8801,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 {
; GFX10-NEXT: s_waitcnt vmcnt(8)
; GFX10-NEXT: buffer_store_dword v33, off, s[0:3], s32
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -8845,8 +8843,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 {
; GFX11-NEXT: s_waitcnt vmcnt(8)
; GFX11-NEXT: scratch_store_b32 off, v32, s32
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -8887,8 +8885,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(8)
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v33, s32
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -8930,8 +8928,8 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT: v_readlane_b32 s31, v42, 1
; GFX9-NEXT: v_readlane_b32 s30, v42, 0
+; GFX9-NEXT: v_readlane_b32 s31, v42, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v42, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -8967,8 +8965,8 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4
-; GFX10-NEXT: v_readlane_b32 s31, v42, 1
; GFX10-NEXT: v_readlane_b32 s30, v42, 0
+; GFX10-NEXT: v_readlane_b32 s31, v42, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v42, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -9004,8 +9002,8 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: scratch_load_b32 v41, off, s33
; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:4
-; GFX11-NEXT: v_readlane_b32 s31, v42, 1
; GFX11-NEXT: v_readlane_b32 s30, v42, 0
+; GFX11-NEXT: v_readlane_b32 s31, v42, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v42, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -9041,8 +9039,8 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou
; GFX10-SCRATCH-NEXT: s_clause 0x1
; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33
; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:4
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v42, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v42, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -9078,8 +9076,8 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 {
; GFX9-NEXT: s_mov_b32 s34, external_void_func_struct_i8_i32@abs32@lo
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -9111,8 +9109,8 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 {
; GFX10-NEXT: s_mov_b32 s34, external_void_func_struct_i8_i32@abs32@lo
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -9145,8 +9143,8 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 {
; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33
; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1
@@ -9178,8 +9176,8 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 {
; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-FAKE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-FAKE16-NEXT: s_mov_b32 s32, s33
; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s1, -1
@@ -9211,8 +9209,8 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 {
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_struct_i8_i32@abs32@lo
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -9249,8 +9247,8 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -9281,8 +9279,8 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 {
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -9315,8 +9313,8 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 {
; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33
; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1
@@ -9347,8 +9345,8 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 {
; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-FAKE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-FAKE16-NEXT: s_mov_b32 s32, s33
; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s1, -1
@@ -9379,8 +9377,8 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 {
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -9424,8 +9422,8 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: buffer_load_ubyte v0, off, s[0:3], s33 offset:8
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:12
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -9466,8 +9464,8 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: buffer_load_ubyte v0, off, s[0:3], s33 offset:8
; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:12
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -9508,8 +9506,8 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: scratch_load_d16_u8 v0, off, s33 offset:8
; GFX11-TRUE16-NEXT: scratch_load_b32 v1, off, s33 offset:12
-; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33
; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
@@ -9548,8 +9546,8 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: scratch_load_u8 v0, off, s33 offset:8
; GFX11-FAKE16-NEXT: scratch_load_b32 v1, off, s33 offset:12
-; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-FAKE16-NEXT: s_mov_b32 s32, s33
; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
@@ -9590,8 +9588,8 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
; GFX10-SCRATCH-NEXT: s_clause 0x1
; GFX10-SCRATCH-NEXT: scratch_load_ubyte v0, off, s33 offset:8
; GFX10-SCRATCH-NEXT: scratch_load_dword v1, off, s33 offset:12
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -9662,8 +9660,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 {
; GFX9-NEXT: v_mov_b32_e32 v2, v17
; GFX9-NEXT: v_mov_b32_e32 v3, v18
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -9713,8 +9711,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 {
; GFX10-NEXT: v_mov_b32_e32 v2, v17
; GFX10-NEXT: v_mov_b32_e32 v3, v18
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -9761,8 +9759,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 {
; GFX11-NEXT: v_dual_mov_b32 v12, v3 :: v_dual_mov_b32 v1, v16
; GFX11-NEXT: v_dual_mov_b32 v2, v17 :: v_dual_mov_b32 v3, v18
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -9812,8 +9810,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 {
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, v17
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, v18
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -9841,46 +9839,46 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:16
; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:20
; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s33
-; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
-; GFX9-NEXT: v_writelane_b32 v40, s34, 2
-; GFX9-NEXT: v_writelane_b32 v40, s35, 3
-; GFX9-NEXT: v_writelane_b32 v40, s36, 4
-; GFX9-NEXT: v_writelane_b32 v40, s37, 5
-; GFX9-NEXT: v_writelane_b32 v40, s38, 6
-; GFX9-NEXT: v_writelane_b32 v40, s39, 7
-; GFX9-NEXT: v_writelane_b32 v40, s48, 8
-; GFX9-NEXT: v_writelane_b32 v40, s49, 9
-; GFX9-NEXT: v_writelane_b32 v40, s50, 10
-; GFX9-NEXT: v_writelane_b32 v40, s51, 11
-; GFX9-NEXT: v_writelane_b32 v40, s52, 12
-; GFX9-NEXT: v_writelane_b32 v40, s53, 13
+; GFX9-NEXT: v_writelane_b32 v40, s34, 0
+; GFX9-NEXT: v_writelane_b32 v40, s35, 1
+; GFX9-NEXT: v_writelane_b32 v40, s36, 2
+; GFX9-NEXT: v_writelane_b32 v40, s37, 3
+; GFX9-NEXT: v_writelane_b32 v40, s38, 4
+; GFX9-NEXT: v_writelane_b32 v40, s39, 5
+; GFX9-NEXT: v_writelane_b32 v40, s48, 6
+; GFX9-NEXT: v_writelane_b32 v40, s49, 7
+; GFX9-NEXT: v_writelane_b32 v40, s50, 8
+; GFX9-NEXT: v_writelane_b32 v40, s51, 9
+; GFX9-NEXT: v_writelane_b32 v40, s52, 10
+; GFX9-NEXT: v_writelane_b32 v40, s53, 11
+; GFX9-NEXT: v_writelane_b32 v40, s54, 12
+; GFX9-NEXT: v_writelane_b32 v40, s55, 13
; GFX9-NEXT: s_addk_i32 s32, 0x800
-; GFX9-NEXT: v_writelane_b32 v40, s54, 14
+; GFX9-NEXT: v_writelane_b32 v40, s30, 14
; GFX9-NEXT: s_mov_b32 s5, byval_align16_f64_arg@abs32@hi
; GFX9-NEXT: s_mov_b32 s4, byval_align16_f64_arg@abs32@lo
-; GFX9-NEXT: v_writelane_b32 v40, s55, 15
+; GFX9-NEXT: v_writelane_b32 v40, s31, 15
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: v_readlane_b32 s55, v40, 15
-; GFX9-NEXT: v_readlane_b32 s54, v40, 14
-; GFX9-NEXT: v_readlane_b32 s53, v40, 13
-; GFX9-NEXT: v_readlane_b32 s52, v40, 12
-; GFX9-NEXT: v_readlane_b32 s51, v40, 11
-; GFX9-NEXT: v_readlane_b32 s50, v40, 10
-; GFX9-NEXT: v_readlane_b32 s49, v40, 9
-; GFX9-NEXT: v_readlane_b32 s48, v40, 8
-; GFX9-NEXT: v_readlane_b32 s39, v40, 7
-; GFX9-NEXT: v_readlane_b32 s38, v40, 6
-; GFX9-NEXT: v_readlane_b32 s37, v40, 5
-; GFX9-NEXT: v_readlane_b32 s36, v40, 4
-; GFX9-NEXT: v_readlane_b32 s35, v40, 3
-; GFX9-NEXT: v_readlane_b32 s34, v40, 2
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
-; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s30, v40, 14
+; GFX9-NEXT: v_readlane_b32 s31, v40, 15
+; GFX9-NEXT: v_readlane_b32 s55, v40, 13
+; GFX9-NEXT: v_readlane_b32 s54, v40, 12
+; GFX9-NEXT: v_readlane_b32 s53, v40, 11
+; GFX9-NEXT: v_readlane_b32 s52, v40, 10
+; GFX9-NEXT: v_readlane_b32 s51, v40, 9
+; GFX9-NEXT: v_readlane_b32 s50, v40, 8
+; GFX9-NEXT: v_readlane_b32 s49, v40, 7
+; GFX9-NEXT: v_readlane_b32 s48, v40, 6
+; GFX9-NEXT: v_readlane_b32 s39, v40, 5
+; GFX9-NEXT: v_readlane_b32 s38, v40, 4
+; GFX9-NEXT: v_readlane_b32 s37, v40, 3
+; GFX9-NEXT: v_readlane_b32 s36, v40, 2
+; GFX9-NEXT: v_readlane_b32 s35, v40, 1
+; GFX9-NEXT: v_readlane_b32 s34, v40, 0
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
@@ -9902,7 +9900,7 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:16
; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:20
; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s33
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s34, 0
; GFX10-NEXT: s_addk_i32 s32, 0x400
; GFX10-NEXT: s_mov_b32 s5, byval_align16_f64_arg@abs32@hi
; GFX10-NEXT: s_mov_b32 s4, byval_align16_f64_arg@abs32@lo
@@ -9910,38 +9908,38 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
; GFX10-NEXT: buffer_store_dword v32, off, s[0:3], s32
; GFX10-NEXT: s_waitcnt vmcnt(1)
; GFX10-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
-; GFX10-NEXT: v_writelane_b32 v40, s34, 2
-; GFX10-NEXT: v_writelane_b32 v40, s35, 3
-; GFX10-NEXT: v_writelane_b32 v40, s36, 4
-; GFX10-NEXT: v_writelane_b32 v40, s37, 5
-; GFX10-NEXT: v_writelane_b32 v40, s38, 6
-; GFX10-NEXT: v_writelane_b32 v40, s39, 7
-; GFX10-NEXT: v_writelane_b32 v40, s48, 8
-; GFX10-NEXT: v_writelane_b32 v40, s49, 9
-; GFX10-NEXT: v_writelane_b32 v40, s50, 10
-; GFX10-NEXT: v_writelane_b32 v40, s51, 11
-; GFX10-NEXT: v_writelane_b32 v40, s52, 12
-; GFX10-NEXT: v_writelane_b32 v40, s53, 13
-; GFX10-NEXT: v_writelane_b32 v40, s54, 14
-; GFX10-NEXT: v_writelane_b32 v40, s55, 15
+; GFX10-NEXT: v_writelane_b32 v40, s35, 1
+; GFX10-NEXT: v_writelane_b32 v40, s36, 2
+; GFX10-NEXT: v_writelane_b32 v40, s37, 3
+; GFX10-NEXT: v_writelane_b32 v40, s38, 4
+; GFX10-NEXT: v_writelane_b32 v40, s39, 5
+; GFX10-NEXT: v_writelane_b32 v40, s48, 6
+; GFX10-NEXT: v_writelane_b32 v40, s49, 7
+; GFX10-NEXT: v_writelane_b32 v40, s50, 8
+; GFX10-NEXT: v_writelane_b32 v40, s51, 9
+; GFX10-NEXT: v_writelane_b32 v40, s52, 10
+; GFX10-NEXT: v_writelane_b32 v40, s53, 11
+; GFX10-NEXT: v_writelane_b32 v40, s54, 12
+; GFX10-NEXT: v_writelane_b32 v40, s55, 13
+; GFX10-NEXT: v_writelane_b32 v40, s30, 14
+; GFX10-NEXT: v_writelane_b32 v40, s31, 15
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT: v_readlane_b32 s55, v40, 15
-; GFX10-NEXT: v_readlane_b32 s54, v40, 14
-; GFX10-NEXT: v_readlane_b32 s53, v40, 13
-; GFX10-NEXT: v_readlane_b32 s52, v40, 12
-; GFX10-NEXT: v_readlane_b32 s51, v40, 11
-; GFX10-NEXT: v_readlane_b32 s50, v40, 10
-; GFX10-NEXT: v_readlane_b32 s49, v40, 9
-; GFX10-NEXT: v_readlane_b32 s48, v40, 8
-; GFX10-NEXT: v_readlane_b32 s39, v40, 7
-; GFX10-NEXT: v_readlane_b32 s38, v40, 6
-; GFX10-NEXT: v_readlane_b32 s37, v40, 5
-; GFX10-NEXT: v_readlane_b32 s36, v40, 4
-; GFX10-NEXT: v_readlane_b32 s35, v40, 3
-; GFX10-NEXT: v_readlane_b32 s34, v40, 2
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
-; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s30, v40, 14
+; GFX10-NEXT: v_readlane_b32 s31, v40, 15
+; GFX10-NEXT: v_readlane_b32 s55, v40, 13
+; GFX10-NEXT: v_readlane_b32 s54, v40, 12
+; GFX10-NEXT: v_readlane_b32 s53, v40, 11
+; GFX10-NEXT: v_readlane_b32 s52, v40, 10
+; GFX10-NEXT: v_readlane_b32 s51, v40, 9
+; GFX10-NEXT: v_readlane_b32 s50, v40, 8
+; GFX10-NEXT: v_readlane_b32 s49, v40, 7
+; GFX10-NEXT: v_readlane_b32 s48, v40, 6
+; GFX10-NEXT: v_readlane_b32 s39, v40, 5
+; GFX10-NEXT: v_readlane_b32 s38, v40, 4
+; GFX10-NEXT: v_readlane_b32 s37, v40, 3
+; GFX10-NEXT: v_readlane_b32 s36, v40, 2
+; GFX10-NEXT: v_readlane_b32 s35, v40, 1
+; GFX10-NEXT: v_readlane_b32 s34, v40, 0
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: s_or_saveexec_b32 s4, -1
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
@@ -9962,44 +9960,44 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: scratch_load_b64 v[32:33], off, s33 offset:16
; GFX11-NEXT: scratch_load_b32 v31, off, s33
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s34, 0
; GFX11-NEXT: s_add_i32 s32, s32, 32
; GFX11-NEXT: s_mov_b32 s1, byval_align16_f64_arg@abs32@hi
; GFX11-NEXT: s_mov_b32 s0, byval_align16_f64_arg@abs32@lo
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
-; GFX11-NEXT: v_writelane_b32 v40, s34, 2
-; GFX11-NEXT: v_writelane_b32 v40, s35, 3
-; GFX11-NEXT: v_writelane_b32 v40, s36, 4
-; GFX11-NEXT: v_writelane_b32 v40, s37, 5
-; GFX11-NEXT: v_writelane_b32 v40, s38, 6
-; GFX11-NEXT: v_writelane_b32 v40, s39, 7
-; GFX11-NEXT: v_writelane_b32 v40, s48, 8
-; GFX11-NEXT: v_writelane_b32 v40, s49, 9
-; GFX11-NEXT: v_writelane_b32 v40, s50, 10
-; GFX11-NEXT: v_writelane_b32 v40, s51, 11
-; GFX11-NEXT: v_writelane_b32 v40, s52, 12
-; GFX11-NEXT: v_writelane_b32 v40, s53, 13
-; GFX11-NEXT: v_writelane_b32 v40, s54, 14
-; GFX11-NEXT: v_writelane_b32 v40, s55, 15
+; GFX11-NEXT: v_writelane_b32 v40, s35, 1
+; GFX11-NEXT: v_writelane_b32 v40, s36, 2
+; GFX11-NEXT: v_writelane_b32 v40, s37, 3
+; GFX11-NEXT: v_writelane_b32 v40, s38, 4
+; GFX11-NEXT: v_writelane_b32 v40, s39, 5
+; GFX11-NEXT: v_writelane_b32 v40, s48, 6
+; GFX11-NEXT: v_writelane_b32 v40, s49, 7
+; GFX11-NEXT: v_writelane_b32 v40, s50, 8
+; GFX11-NEXT: v_writelane_b32 v40, s51, 9
+; GFX11-NEXT: v_writelane_b32 v40, s52, 10
+; GFX11-NEXT: v_writelane_b32 v40, s53, 11
+; GFX11-NEXT: v_writelane_b32 v40, s54, 12
+; GFX11-NEXT: v_writelane_b32 v40, s55, 13
+; GFX11-NEXT: v_writelane_b32 v40, s30, 14
+; GFX11-NEXT: v_writelane_b32 v40, s31, 15
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: scratch_store_b64 off, v[32:33], s32
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: v_readlane_b32 s55, v40, 15
-; GFX11-NEXT: v_readlane_b32 s54, v40, 14
-; GFX11-NEXT: v_readlane_b32 s53, v40, 13
-; GFX11-NEXT: v_readlane_b32 s52, v40, 12
-; GFX11-NEXT: v_readlane_b32 s51, v40, 11
-; GFX11-NEXT: v_readlane_b32 s50, v40, 10
-; GFX11-NEXT: v_readlane_b32 s49, v40, 9
-; GFX11-NEXT: v_readlane_b32 s48, v40, 8
-; GFX11-NEXT: v_readlane_b32 s39, v40, 7
-; GFX11-NEXT: v_readlane_b32 s38, v40, 6
-; GFX11-NEXT: v_readlane_b32 s37, v40, 5
-; GFX11-NEXT: v_readlane_b32 s36, v40, 4
-; GFX11-NEXT: v_readlane_b32 s35, v40, 3
-; GFX11-NEXT: v_readlane_b32 s34, v40, 2
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
-; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s30, v40, 14
+; GFX11-NEXT: v_readlane_b32 s31, v40, 15
+; GFX11-NEXT: v_readlane_b32 s55, v40, 13
+; GFX11-NEXT: v_readlane_b32 s54, v40, 12
+; GFX11-NEXT: v_readlane_b32 s53, v40, 11
+; GFX11-NEXT: v_readlane_b32 s52, v40, 10
+; GFX11-NEXT: v_readlane_b32 s51, v40, 9
+; GFX11-NEXT: v_readlane_b32 s50, v40, 8
+; GFX11-NEXT: v_readlane_b32 s49, v40, 7
+; GFX11-NEXT: v_readlane_b32 s48, v40, 6
+; GFX11-NEXT: v_readlane_b32 s39, v40, 5
+; GFX11-NEXT: v_readlane_b32 s38, v40, 4
+; GFX11-NEXT: v_readlane_b32 s37, v40, 3
+; GFX11-NEXT: v_readlane_b32 s36, v40, 2
+; GFX11-NEXT: v_readlane_b32 s35, v40, 1
+; GFX11-NEXT: v_readlane_b32 s34, v40, 0
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:24 ; 4-byte Folded Reload
@@ -10020,44 +10018,44 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
; GFX10-SCRATCH-NEXT: s_clause 0x1
; GFX10-SCRATCH-NEXT: scratch_load_dwordx2 v[32:33], off, s33 offset:16
; GFX10-SCRATCH-NEXT: scratch_load_dword v31, off, s33
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s34, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 32
; GFX10-SCRATCH-NEXT: s_mov_b32 s1, byval_align16_f64_arg@abs32@hi
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, byval_align16_f64_arg@abs32@lo
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s34, 2
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s35, 3
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s36, 4
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s37, 5
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s38, 6
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s39, 7
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s48, 8
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s49, 9
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s50, 10
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s51, 11
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s52, 12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s53, 13
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s54, 14
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s55, 15
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s35, 1
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s36, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s37, 3
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s38, 4
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s39, 5
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s48, 6
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s49, 7
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s50, 8
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s51, 9
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s52, 10
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s53, 11
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s54, 12
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s55, 13
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 14
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 15
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(1)
; GFX10-SCRATCH-NEXT: scratch_store_dwordx2 off, v[32:33], s32
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s55, v40, 15
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s54, v40, 14
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s53, v40, 13
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s52, v40, 12
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s51, v40, 11
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s50, v40, 10
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s49, v40, 9
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s48, v40, 8
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s39, v40, 7
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s38, v40, 6
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s37, v40, 5
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s36, v40, 4
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s35, v40, 3
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s34, v40, 2
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 14
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 15
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s55, v40, 13
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s54, v40, 12
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s53, v40, 11
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s52, v40, 10
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s51, v40, 9
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s50, v40, 8
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s49, v40, 7
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s48, v40, 6
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s39, v40, 5
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s38, v40, 4
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s37, v40, 3
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s36, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s35, v40, 1
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s34, v40, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:24 ; 4-byte Folded Reload
@@ -10091,8 +10089,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm_inreg() #0 {
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s32
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -10120,8 +10118,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm_inreg() #0 {
; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -10150,8 +10148,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm_inreg() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -10179,8 +10177,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s32
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -10212,8 +10210,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 2
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 2
; GFX9-NEXT: v_readlane_b32 s30, v40, 1
+; GFX9-NEXT: v_readlane_b32 s31, v40, 2
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 3
@@ -10242,8 +10240,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-NEXT: v_readlane_b32 s30, v40, 1
+; GFX10-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 3
@@ -10273,8 +10271,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 2
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 2
; GFX11-NEXT: v_readlane_b32 s30, v40, 1
+; GFX11-NEXT: v_readlane_b32 s31, v40, 2
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 3
@@ -10303,8 +10301,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3
@@ -10337,8 +10335,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 2
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 2
; GFX9-NEXT: v_readlane_b32 s30, v40, 1
+; GFX9-NEXT: v_readlane_b32 s31, v40, 2
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 3
@@ -10367,8 +10365,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-NEXT: v_readlane_b32 s30, v40, 1
+; GFX10-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 3
@@ -10398,8 +10396,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 2
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 2
; GFX11-NEXT: v_readlane_b32 s30, v40, 1
+; GFX11-NEXT: v_readlane_b32 s31, v40, 2
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 3
@@ -10428,8 +10426,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3
@@ -10462,8 +10460,8 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 2
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 2
; GFX9-NEXT: v_readlane_b32 s30, v40, 1
+; GFX9-NEXT: v_readlane_b32 s31, v40, 2
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 3
@@ -10492,8 +10490,8 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-NEXT: v_readlane_b32 s30, v40, 1
+; GFX10-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 3
@@ -10523,8 +10521,8 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 2
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 2
; GFX11-NEXT: v_readlane_b32 s30, v40, 1
+; GFX11-NEXT: v_readlane_b32 s31, v40, 2
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 3
@@ -10553,8 +10551,8 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3
@@ -10589,8 +10587,8 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 3
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 3
; GFX9-NEXT: v_readlane_b32 s30, v40, 2
+; GFX9-NEXT: v_readlane_b32 s31, v40, 3
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_mov_b32 s32, s33
@@ -10622,8 +10620,8 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-NEXT: v_readlane_b32 s30, v40, 2
+; GFX10-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_mov_b32 s32, s33
@@ -10656,8 +10654,8 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 3
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 3
; GFX11-NEXT: v_readlane_b32 s30, v40, 2
+; GFX11-NEXT: v_readlane_b32 s31, v40, 3
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -10689,8 +10687,8 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
@@ -10728,8 +10726,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 5
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 5
; GFX9-NEXT: v_readlane_b32 s30, v40, 4
+; GFX9-NEXT: v_readlane_b32 s31, v40, 5
; GFX9-NEXT: v_readlane_b32 s7, v40, 3
; GFX9-NEXT: v_readlane_b32 s6, v40, 2
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
@@ -10765,8 +10763,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 4
; GFX10-NEXT: v_writelane_b32 v40, s31, 5
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 5
; GFX10-NEXT: v_readlane_b32 s30, v40, 4
+; GFX10-NEXT: v_readlane_b32 s31, v40, 5
; GFX10-NEXT: v_readlane_b32 s7, v40, 3
; GFX10-NEXT: v_readlane_b32 s6, v40, 2
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
@@ -10803,8 +10801,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 5
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 5
; GFX11-NEXT: v_readlane_b32 s30, v40, 4
+; GFX11-NEXT: v_readlane_b32 s31, v40, 5
; GFX11-NEXT: v_readlane_b32 s7, v40, 3
; GFX11-NEXT: v_readlane_b32 s6, v40, 2
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
@@ -10840,8 +10838,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 4
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 5
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 4
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5
; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
@@ -10884,8 +10882,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 5
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 5
; GFX9-NEXT: v_readlane_b32 s30, v40, 4
+; GFX9-NEXT: v_readlane_b32 s31, v40, 5
; GFX9-NEXT: v_readlane_b32 s7, v40, 3
; GFX9-NEXT: v_readlane_b32 s6, v40, 2
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
@@ -10923,8 +10921,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 4
; GFX10-NEXT: v_writelane_b32 v40, s31, 5
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 5
; GFX10-NEXT: v_readlane_b32 s30, v40, 4
+; GFX10-NEXT: v_readlane_b32 s31, v40, 5
; GFX10-NEXT: v_readlane_b32 s7, v40, 3
; GFX10-NEXT: v_readlane_b32 s6, v40, 2
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
@@ -10963,8 +10961,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 5
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 5
; GFX11-NEXT: v_readlane_b32 s30, v40, 4
+; GFX11-NEXT: v_readlane_b32 s31, v40, 5
; GFX11-NEXT: v_readlane_b32 s7, v40, 3
; GFX11-NEXT: v_readlane_b32 s6, v40, 2
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
@@ -11002,8 +11000,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 4
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 5
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 4
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5
; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
@@ -11047,8 +11045,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 7
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 7
; GFX9-NEXT: v_readlane_b32 s30, v40, 6
+; GFX9-NEXT: v_readlane_b32 s31, v40, 7
; GFX9-NEXT: v_readlane_b32 s9, v40, 5
; GFX9-NEXT: v_readlane_b32 s8, v40, 4
; GFX9-NEXT: v_readlane_b32 s7, v40, 3
@@ -11090,8 +11088,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 6
; GFX10-NEXT: v_writelane_b32 v40, s31, 7
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 7
; GFX10-NEXT: v_readlane_b32 s30, v40, 6
+; GFX10-NEXT: v_readlane_b32 s31, v40, 7
; GFX10-NEXT: v_readlane_b32 s9, v40, 5
; GFX10-NEXT: v_readlane_b32 s8, v40, 4
; GFX10-NEXT: v_readlane_b32 s7, v40, 3
@@ -11134,8 +11132,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 7
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 7
; GFX11-NEXT: v_readlane_b32 s30, v40, 6
+; GFX11-NEXT: v_readlane_b32 s31, v40, 7
; GFX11-NEXT: v_readlane_b32 s9, v40, 5
; GFX11-NEXT: v_readlane_b32 s8, v40, 4
; GFX11-NEXT: v_readlane_b32 s7, v40, 3
@@ -11177,8 +11175,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 6
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 7
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 7
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 6
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 7
; GFX10-SCRATCH-NEXT: v_readlane_b32 s9, v40, 5
; GFX10-SCRATCH-NEXT: v_readlane_b32 s8, v40, 4
; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3
@@ -11231,8 +11229,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 9
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 9
; GFX9-NEXT: v_readlane_b32 s30, v40, 8
+; GFX9-NEXT: v_readlane_b32 s31, v40, 9
; GFX9-NEXT: v_readlane_b32 s11, v40, 7
; GFX9-NEXT: v_readlane_b32 s10, v40, 6
; GFX9-NEXT: v_readlane_b32 s9, v40, 5
@@ -11280,8 +11278,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 8
; GFX10-NEXT: v_writelane_b32 v40, s31, 9
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 9
; GFX10-NEXT: v_readlane_b32 s30, v40, 8
+; GFX10-NEXT: v_readlane_b32 s31, v40, 9
; GFX10-NEXT: v_readlane_b32 s11, v40, 7
; GFX10-NEXT: v_readlane_b32 s10, v40, 6
; GFX10-NEXT: v_readlane_b32 s9, v40, 5
@@ -11330,8 +11328,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 9
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 9
; GFX11-NEXT: v_readlane_b32 s30, v40, 8
+; GFX11-NEXT: v_readlane_b32 s31, v40, 9
; GFX11-NEXT: v_readlane_b32 s11, v40, 7
; GFX11-NEXT: v_readlane_b32 s10, v40, 6
; GFX11-NEXT: v_readlane_b32 s9, v40, 5
@@ -11379,8 +11377,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 8
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 9
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 9
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 8
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 9
; GFX10-SCRATCH-NEXT: v_readlane_b32 s11, v40, 7
; GFX10-SCRATCH-NEXT: v_readlane_b32 s10, v40, 6
; GFX10-SCRATCH-NEXT: v_readlane_b32 s9, v40, 5
@@ -11422,8 +11420,8 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 2
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 2
; GFX9-NEXT: v_readlane_b32 s30, v40, 1
+; GFX9-NEXT: v_readlane_b32 s31, v40, 2
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 3
@@ -11452,8 +11450,8 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-NEXT: v_readlane_b32 s30, v40, 1
+; GFX10-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 3
@@ -11483,8 +11481,8 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 2
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 2
; GFX11-NEXT: v_readlane_b32 s30, v40, 1
+; GFX11-NEXT: v_readlane_b32 s31, v40, 2
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 3
@@ -11513,8 +11511,8 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3
@@ -11547,8 +11545,8 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 2
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 2
; GFX9-NEXT: v_readlane_b32 s30, v40, 1
+; GFX9-NEXT: v_readlane_b32 s31, v40, 2
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 3
@@ -11577,8 +11575,8 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-NEXT: v_readlane_b32 s30, v40, 1
+; GFX10-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 3
@@ -11608,8 +11606,8 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 2
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 2
; GFX11-NEXT: v_readlane_b32 s30, v40, 1
+; GFX11-NEXT: v_readlane_b32 s31, v40, 2
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 3
@@ -11638,8 +11636,8 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3
@@ -11674,8 +11672,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 3
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 3
; GFX9-NEXT: v_readlane_b32 s30, v40, 2
+; GFX9-NEXT: v_readlane_b32 s31, v40, 3
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_mov_b32 s32, s33
@@ -11707,8 +11705,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-NEXT: v_readlane_b32 s30, v40, 2
+; GFX10-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_mov_b32 s32, s33
@@ -11741,8 +11739,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 3
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 3
; GFX11-NEXT: v_readlane_b32 s30, v40, 2
+; GFX11-NEXT: v_readlane_b32 s31, v40, 3
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -11774,8 +11772,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
@@ -11813,8 +11811,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 4
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 4
; GFX9-NEXT: v_readlane_b32 s30, v40, 3
+; GFX9-NEXT: v_readlane_b32 s31, v40, 4
; GFX9-NEXT: v_readlane_b32 s6, v40, 2
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
@@ -11849,8 +11847,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 3
; GFX10-NEXT: v_writelane_b32 v40, s31, 4
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 4
; GFX10-NEXT: v_readlane_b32 s30, v40, 3
+; GFX10-NEXT: v_readlane_b32 s31, v40, 4
; GFX10-NEXT: v_readlane_b32 s6, v40, 2
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
@@ -11886,8 +11884,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 4
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 4
; GFX11-NEXT: v_readlane_b32 s30, v40, 3
+; GFX11-NEXT: v_readlane_b32 s31, v40, 4
; GFX11-NEXT: v_readlane_b32 s6, v40, 2
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
@@ -11922,8 +11920,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 3
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 4
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 4
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 3
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 4
; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
@@ -11966,8 +11964,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 6
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 6
; GFX9-NEXT: v_readlane_b32 s30, v40, 5
+; GFX9-NEXT: v_readlane_b32 s31, v40, 6
; GFX9-NEXT: v_readlane_b32 s8, v40, 4
; GFX9-NEXT: v_readlane_b32 s7, v40, 3
; GFX9-NEXT: v_readlane_b32 s6, v40, 2
@@ -12008,8 +12006,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 5
; GFX10-NEXT: v_writelane_b32 v40, s31, 6
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 6
; GFX10-NEXT: v_readlane_b32 s30, v40, 5
+; GFX10-NEXT: v_readlane_b32 s31, v40, 6
; GFX10-NEXT: v_readlane_b32 s8, v40, 4
; GFX10-NEXT: v_readlane_b32 s7, v40, 3
; GFX10-NEXT: v_readlane_b32 s6, v40, 2
@@ -12051,8 +12049,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 6
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 6
; GFX11-NEXT: v_readlane_b32 s30, v40, 5
+; GFX11-NEXT: v_readlane_b32 s31, v40, 6
; GFX11-NEXT: v_readlane_b32 s8, v40, 4
; GFX11-NEXT: v_readlane_b32 s7, v40, 3
; GFX11-NEXT: v_readlane_b32 s6, v40, 2
@@ -12093,8 +12091,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 5
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 6
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 6
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 5
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 6
; GFX10-SCRATCH-NEXT: v_readlane_b32 s8, v40, 4
; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2
@@ -12133,8 +12131,8 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 3
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 3
; GFX9-NEXT: v_readlane_b32 s30, v40, 2
+; GFX9-NEXT: v_readlane_b32 s31, v40, 3
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_mov_b32 s32, s33
@@ -12166,8 +12164,8 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-NEXT: v_readlane_b32 s30, v40, 2
+; GFX10-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_mov_b32 s32, s33
@@ -12200,8 +12198,8 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 3
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 3
; GFX11-NEXT: v_readlane_b32 s30, v40, 2
+; GFX11-NEXT: v_readlane_b32 s31, v40, 3
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -12233,8 +12231,8 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
@@ -12274,8 +12272,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 5
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 5
; GFX9-NEXT: v_readlane_b32 s30, v40, 4
+; GFX9-NEXT: v_readlane_b32 s31, v40, 5
; GFX9-NEXT: v_readlane_b32 s7, v40, 3
; GFX9-NEXT: v_readlane_b32 s6, v40, 2
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
@@ -12313,8 +12311,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 4
; GFX10-NEXT: v_writelane_b32 v40, s31, 5
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 5
; GFX10-NEXT: v_readlane_b32 s30, v40, 4
+; GFX10-NEXT: v_readlane_b32 s31, v40, 5
; GFX10-NEXT: v_readlane_b32 s7, v40, 3
; GFX10-NEXT: v_readlane_b32 s6, v40, 2
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
@@ -12353,8 +12351,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 5
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 5
; GFX11-NEXT: v_readlane_b32 s30, v40, 4
+; GFX11-NEXT: v_readlane_b32 s31, v40, 5
; GFX11-NEXT: v_readlane_b32 s7, v40, 3
; GFX11-NEXT: v_readlane_b32 s6, v40, 2
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
@@ -12392,8 +12390,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 4
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 5
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 4
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5
; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
@@ -12439,8 +12437,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 7
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 7
; GFX9-NEXT: v_readlane_b32 s30, v40, 6
+; GFX9-NEXT: v_readlane_b32 s31, v40, 7
; GFX9-NEXT: v_readlane_b32 s9, v40, 5
; GFX9-NEXT: v_readlane_b32 s8, v40, 4
; GFX9-NEXT: v_readlane_b32 s7, v40, 3
@@ -12484,8 +12482,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 6
; GFX10-NEXT: v_writelane_b32 v40, s31, 7
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 7
; GFX10-NEXT: v_readlane_b32 s30, v40, 6
+; GFX10-NEXT: v_readlane_b32 s31, v40, 7
; GFX10-NEXT: v_readlane_b32 s9, v40, 5
; GFX10-NEXT: v_readlane_b32 s8, v40, 4
; GFX10-NEXT: v_readlane_b32 s7, v40, 3
@@ -12530,8 +12528,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 7
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 7
; GFX11-NEXT: v_readlane_b32 s30, v40, 6
+; GFX11-NEXT: v_readlane_b32 s31, v40, 7
; GFX11-NEXT: v_readlane_b32 s9, v40, 5
; GFX11-NEXT: v_readlane_b32 s8, v40, 4
; GFX11-NEXT: v_readlane_b32 s7, v40, 3
@@ -12575,8 +12573,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 6
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 7
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 7
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 6
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 7
; GFX10-SCRATCH-NEXT: v_readlane_b32 s9, v40, 5
; GFX10-SCRATCH-NEXT: v_readlane_b32 s8, v40, 4
; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3
@@ -12614,8 +12612,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 2
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 2
; GFX9-NEXT: v_readlane_b32 s30, v40, 1
+; GFX9-NEXT: v_readlane_b32 s31, v40, 2
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 3
@@ -12644,8 +12642,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-NEXT: v_readlane_b32 s30, v40, 1
+; GFX10-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 3
@@ -12675,8 +12673,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 2
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 2
; GFX11-NEXT: v_readlane_b32 s30, v40, 1
+; GFX11-NEXT: v_readlane_b32 s31, v40, 2
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 3
@@ -12705,8 +12703,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3
@@ -12741,8 +12739,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 3
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 3
; GFX9-NEXT: v_readlane_b32 s30, v40, 2
+; GFX9-NEXT: v_readlane_b32 s31, v40, 3
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_mov_b32 s32, s33
@@ -12773,8 +12771,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-NEXT: v_readlane_b32 s30, v40, 2
+; GFX10-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_mov_b32 s32, s33
@@ -12806,8 +12804,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 3
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 3
; GFX11-NEXT: v_readlane_b32 s30, v40, 2
+; GFX11-NEXT: v_readlane_b32 s31, v40, 3
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -12838,8 +12836,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
@@ -12875,8 +12873,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 3
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 3
; GFX9-NEXT: v_readlane_b32 s30, v40, 2
+; GFX9-NEXT: v_readlane_b32 s31, v40, 3
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_mov_b32 s32, s33
@@ -12907,8 +12905,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-NEXT: v_readlane_b32 s30, v40, 2
+; GFX10-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_mov_b32 s32, s33
@@ -12940,8 +12938,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 3
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 3
; GFX11-NEXT: v_readlane_b32 s30, v40, 2
+; GFX11-NEXT: v_readlane_b32 s31, v40, 3
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -12972,8 +12970,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
@@ -13010,8 +13008,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 3
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 3
; GFX9-NEXT: v_readlane_b32 s30, v40, 2
+; GFX9-NEXT: v_readlane_b32 s31, v40, 3
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_mov_b32 s32, s33
@@ -13043,8 +13041,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-NEXT: v_readlane_b32 s30, v40, 2
+; GFX10-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_mov_b32 s32, s33
@@ -13077,8 +13075,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 3
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 3
; GFX11-NEXT: v_readlane_b32 s30, v40, 2
+; GFX11-NEXT: v_readlane_b32 s31, v40, 3
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -13110,8 +13108,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
@@ -13147,8 +13145,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 3
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 3
; GFX9-NEXT: v_readlane_b32 s30, v40, 2
+; GFX9-NEXT: v_readlane_b32 s31, v40, 3
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_mov_b32 s32, s33
@@ -13180,8 +13178,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-NEXT: v_readlane_b32 s30, v40, 2
+; GFX10-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_mov_b32 s32, s33
@@ -13214,8 +13212,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 3
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 3
; GFX11-NEXT: v_readlane_b32 s30, v40, 2
+; GFX11-NEXT: v_readlane_b32 s31, v40, 3
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -13247,8 +13245,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
@@ -13283,8 +13281,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 3
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 3
; GFX9-NEXT: v_readlane_b32 s30, v40, 2
+; GFX9-NEXT: v_readlane_b32 s31, v40, 3
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_mov_b32 s32, s33
@@ -13315,8 +13313,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-NEXT: v_readlane_b32 s30, v40, 2
+; GFX10-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_mov_b32 s32, s33
@@ -13348,8 +13346,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 3
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 3
; GFX11-NEXT: v_readlane_b32 s30, v40, 2
+; GFX11-NEXT: v_readlane_b32 s31, v40, 3
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -13380,8 +13378,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
@@ -13418,8 +13416,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 3
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 3
; GFX9-NEXT: v_readlane_b32 s30, v40, 2
+; GFX9-NEXT: v_readlane_b32 s31, v40, 3
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_mov_b32 s32, s33
@@ -13451,8 +13449,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-NEXT: v_readlane_b32 s30, v40, 2
+; GFX10-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_mov_b32 s32, s33
@@ -13485,8 +13483,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 3
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 3
; GFX11-NEXT: v_readlane_b32 s30, v40, 2
+; GFX11-NEXT: v_readlane_b32 s31, v40, 3
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -13518,8 +13516,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
@@ -13553,8 +13551,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 2
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 2
; GFX9-NEXT: v_readlane_b32 s30, v40, 1
+; GFX9-NEXT: v_readlane_b32 s31, v40, 2
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 3
@@ -13583,8 +13581,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-NEXT: v_readlane_b32 s30, v40, 1
+; GFX10-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 3
@@ -13614,8 +13612,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 2
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 2
; GFX11-NEXT: v_readlane_b32 s30, v40, 1
+; GFX11-NEXT: v_readlane_b32 s31, v40, 2
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 3
@@ -13644,8 +13642,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3
@@ -13680,8 +13678,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 3
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 3
; GFX9-NEXT: v_readlane_b32 s30, v40, 2
+; GFX9-NEXT: v_readlane_b32 s31, v40, 3
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_mov_b32 s32, s33
@@ -13712,8 +13710,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-NEXT: v_readlane_b32 s30, v40, 2
+; GFX10-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_mov_b32 s32, s33
@@ -13745,8 +13743,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 3
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 3
; GFX11-NEXT: v_readlane_b32 s30, v40, 2
+; GFX11-NEXT: v_readlane_b32 s31, v40, 3
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -13777,8 +13775,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
@@ -13815,8 +13813,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 3
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 3
; GFX9-NEXT: v_readlane_b32 s30, v40, 2
+; GFX9-NEXT: v_readlane_b32 s31, v40, 3
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_mov_b32 s32, s33
@@ -13848,8 +13846,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-NEXT: v_readlane_b32 s30, v40, 2
+; GFX10-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_mov_b32 s32, s33
@@ -13882,8 +13880,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 3
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 3
; GFX11-NEXT: v_readlane_b32 s30, v40, 2
+; GFX11-NEXT: v_readlane_b32 s31, v40, 3
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -13915,8 +13913,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
@@ -13954,8 +13952,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 4
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 4
; GFX9-NEXT: v_readlane_b32 s30, v40, 3
+; GFX9-NEXT: v_readlane_b32 s31, v40, 4
; GFX9-NEXT: v_readlane_b32 s6, v40, 2
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
@@ -13990,8 +13988,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 3
; GFX10-NEXT: v_writelane_b32 v40, s31, 4
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 4
; GFX10-NEXT: v_readlane_b32 s30, v40, 3
+; GFX10-NEXT: v_readlane_b32 s31, v40, 4
; GFX10-NEXT: v_readlane_b32 s6, v40, 2
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
@@ -14027,8 +14025,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 4
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 4
; GFX11-NEXT: v_readlane_b32 s30, v40, 3
+; GFX11-NEXT: v_readlane_b32 s31, v40, 4
; GFX11-NEXT: v_readlane_b32 s6, v40, 2
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
@@ -14063,8 +14061,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 3
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 4
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 4
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 3
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 4
; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
@@ -14105,8 +14103,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 5
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 5
; GFX9-NEXT: v_readlane_b32 s30, v40, 4
+; GFX9-NEXT: v_readlane_b32 s31, v40, 5
; GFX9-NEXT: v_readlane_b32 s7, v40, 3
; GFX9-NEXT: v_readlane_b32 s6, v40, 2
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
@@ -14144,8 +14142,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 4
; GFX10-NEXT: v_writelane_b32 v40, s31, 5
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 5
; GFX10-NEXT: v_readlane_b32 s30, v40, 4
+; GFX10-NEXT: v_readlane_b32 s31, v40, 5
; GFX10-NEXT: v_readlane_b32 s7, v40, 3
; GFX10-NEXT: v_readlane_b32 s6, v40, 2
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
@@ -14184,8 +14182,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 5
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 5
; GFX11-NEXT: v_readlane_b32 s30, v40, 4
+; GFX11-NEXT: v_readlane_b32 s31, v40, 5
; GFX11-NEXT: v_readlane_b32 s7, v40, 3
; GFX11-NEXT: v_readlane_b32 s6, v40, 2
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
@@ -14223,8 +14221,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 4
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 5
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 4
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5
; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
@@ -14263,8 +14261,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 5
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 5
; GFX9-NEXT: v_readlane_b32 s30, v40, 4
+; GFX9-NEXT: v_readlane_b32 s31, v40, 5
; GFX9-NEXT: v_readlane_b32 s7, v40, 3
; GFX9-NEXT: v_readlane_b32 s6, v40, 2
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
@@ -14299,8 +14297,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 4
; GFX10-NEXT: v_writelane_b32 v40, s31, 5
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 5
; GFX10-NEXT: v_readlane_b32 s30, v40, 4
+; GFX10-NEXT: v_readlane_b32 s31, v40, 5
; GFX10-NEXT: v_readlane_b32 s7, v40, 3
; GFX10-NEXT: v_readlane_b32 s6, v40, 2
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
@@ -14336,8 +14334,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 5
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 5
; GFX11-NEXT: v_readlane_b32 s30, v40, 4
+; GFX11-NEXT: v_readlane_b32 s31, v40, 5
; GFX11-NEXT: v_readlane_b32 s7, v40, 3
; GFX11-NEXT: v_readlane_b32 s6, v40, 2
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
@@ -14372,8 +14370,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 4
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 5
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 4
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5
; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
@@ -14416,8 +14414,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 5
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 5
; GFX9-NEXT: v_readlane_b32 s30, v40, 4
+; GFX9-NEXT: v_readlane_b32 s31, v40, 5
; GFX9-NEXT: v_readlane_b32 s7, v40, 3
; GFX9-NEXT: v_readlane_b32 s6, v40, 2
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
@@ -14455,8 +14453,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 4
; GFX10-NEXT: v_writelane_b32 v40, s31, 5
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 5
; GFX10-NEXT: v_readlane_b32 s30, v40, 4
+; GFX10-NEXT: v_readlane_b32 s31, v40, 5
; GFX10-NEXT: v_readlane_b32 s7, v40, 3
; GFX10-NEXT: v_readlane_b32 s6, v40, 2
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
@@ -14495,8 +14493,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 5
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 5
; GFX11-NEXT: v_readlane_b32 s30, v40, 4
+; GFX11-NEXT: v_readlane_b32 s31, v40, 5
; GFX11-NEXT: v_readlane_b32 s7, v40, 3
; GFX11-NEXT: v_readlane_b32 s6, v40, 2
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
@@ -14534,8 +14532,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 4
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 5
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 4
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5
; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
@@ -14579,8 +14577,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 6
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 6
; GFX9-NEXT: v_readlane_b32 s30, v40, 5
+; GFX9-NEXT: v_readlane_b32 s31, v40, 6
; GFX9-NEXT: v_readlane_b32 s8, v40, 4
; GFX9-NEXT: v_readlane_b32 s7, v40, 3
; GFX9-NEXT: v_readlane_b32 s6, v40, 2
@@ -14621,8 +14619,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 5
; GFX10-NEXT: v_writelane_b32 v40, s31, 6
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 6
; GFX10-NEXT: v_readlane_b32 s30, v40, 5
+; GFX10-NEXT: v_readlane_b32 s31, v40, 6
; GFX10-NEXT: v_readlane_b32 s8, v40, 4
; GFX10-NEXT: v_readlane_b32 s7, v40, 3
; GFX10-NEXT: v_readlane_b32 s6, v40, 2
@@ -14664,8 +14662,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 6
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 6
; GFX11-NEXT: v_readlane_b32 s30, v40, 5
+; GFX11-NEXT: v_readlane_b32 s31, v40, 6
; GFX11-NEXT: v_readlane_b32 s8, v40, 4
; GFX11-NEXT: v_readlane_b32 s7, v40, 3
; GFX11-NEXT: v_readlane_b32 s6, v40, 2
@@ -14706,8 +14704,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 5
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 6
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 6
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 5
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 6
; GFX10-SCRATCH-NEXT: v_readlane_b32 s8, v40, 4
; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2
@@ -14753,8 +14751,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 9
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 9
; GFX9-NEXT: v_readlane_b32 s30, v40, 8
+; GFX9-NEXT: v_readlane_b32 s31, v40, 9
; GFX9-NEXT: v_readlane_b32 s11, v40, 7
; GFX9-NEXT: v_readlane_b32 s10, v40, 6
; GFX9-NEXT: v_readlane_b32 s9, v40, 5
@@ -14799,8 +14797,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 8
; GFX10-NEXT: v_writelane_b32 v40, s31, 9
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 9
; GFX10-NEXT: v_readlane_b32 s30, v40, 8
+; GFX10-NEXT: v_readlane_b32 s31, v40, 9
; GFX10-NEXT: v_readlane_b32 s11, v40, 7
; GFX10-NEXT: v_readlane_b32 s10, v40, 6
; GFX10-NEXT: v_readlane_b32 s9, v40, 5
@@ -14846,8 +14844,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 9
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 9
; GFX11-NEXT: v_readlane_b32 s30, v40, 8
+; GFX11-NEXT: v_readlane_b32 s31, v40, 9
; GFX11-NEXT: v_readlane_b32 s11, v40, 7
; GFX11-NEXT: v_readlane_b32 s10, v40, 6
; GFX11-NEXT: v_readlane_b32 s9, v40, 5
@@ -14892,8 +14890,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 8
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 9
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 9
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 8
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 9
; GFX10-SCRATCH-NEXT: v_readlane_b32 s11, v40, 7
; GFX10-SCRATCH-NEXT: v_readlane_b32 s10, v40, 6
; GFX10-SCRATCH-NEXT: v_readlane_b32 s9, v40, 5
@@ -14949,8 +14947,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 9
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 9
; GFX9-NEXT: v_readlane_b32 s30, v40, 8
+; GFX9-NEXT: v_readlane_b32 s31, v40, 9
; GFX9-NEXT: v_readlane_b32 s11, v40, 7
; GFX9-NEXT: v_readlane_b32 s10, v40, 6
; GFX9-NEXT: v_readlane_b32 s9, v40, 5
@@ -15000,8 +14998,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 8
; GFX10-NEXT: v_writelane_b32 v40, s31, 9
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 9
; GFX10-NEXT: v_readlane_b32 s30, v40, 8
+; GFX10-NEXT: v_readlane_b32 s31, v40, 9
; GFX10-NEXT: v_readlane_b32 s11, v40, 7
; GFX10-NEXT: v_readlane_b32 s10, v40, 6
; GFX10-NEXT: v_readlane_b32 s9, v40, 5
@@ -15052,8 +15050,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 9
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 9
; GFX11-NEXT: v_readlane_b32 s30, v40, 8
+; GFX11-NEXT: v_readlane_b32 s31, v40, 9
; GFX11-NEXT: v_readlane_b32 s11, v40, 7
; GFX11-NEXT: v_readlane_b32 s10, v40, 6
; GFX11-NEXT: v_readlane_b32 s9, v40, 5
@@ -15103,8 +15101,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 8
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 9
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 9
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 8
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 9
; GFX10-SCRATCH-NEXT: v_readlane_b32 s11, v40, 7
; GFX10-SCRATCH-NEXT: v_readlane_b32 s10, v40, 6
; GFX10-SCRATCH-NEXT: v_readlane_b32 s9, v40, 5
@@ -15161,8 +15159,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 17
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 17
; GFX9-NEXT: v_readlane_b32 s30, v40, 16
+; GFX9-NEXT: v_readlane_b32 s31, v40, 17
; GFX9-NEXT: v_readlane_b32 s19, v40, 15
; GFX9-NEXT: v_readlane_b32 s18, v40, 14
; GFX9-NEXT: v_readlane_b32 s17, v40, 13
@@ -15223,8 +15221,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 16
; GFX10-NEXT: v_writelane_b32 v40, s31, 17
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 17
; GFX10-NEXT: v_readlane_b32 s30, v40, 16
+; GFX10-NEXT: v_readlane_b32 s31, v40, 17
; GFX10-NEXT: v_readlane_b32 s19, v40, 15
; GFX10-NEXT: v_readlane_b32 s18, v40, 14
; GFX10-NEXT: v_readlane_b32 s17, v40, 13
@@ -15286,8 +15284,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 17
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 17
; GFX11-NEXT: v_readlane_b32 s30, v40, 16
+; GFX11-NEXT: v_readlane_b32 s31, v40, 17
; GFX11-NEXT: v_readlane_b32 s19, v40, 15
; GFX11-NEXT: v_readlane_b32 s18, v40, 14
; GFX11-NEXT: v_readlane_b32 s17, v40, 13
@@ -15348,8 +15346,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 17
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 17
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 16
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 17
; GFX10-SCRATCH-NEXT: v_readlane_b32 s19, v40, 15
; GFX10-SCRATCH-NEXT: v_readlane_b32 s18, v40, 14
; GFX10-SCRATCH-NEXT: v_readlane_b32 s17, v40, 13
@@ -15450,8 +15448,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
; GFX9-NEXT: v_writelane_b32 v40, s31, 27
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 27
; GFX9-NEXT: v_readlane_b32 s30, v40, 26
+; GFX9-NEXT: v_readlane_b32 s31, v40, 27
; GFX9-NEXT: v_readlane_b32 s29, v40, 25
; GFX9-NEXT: v_readlane_b32 s28, v40, 24
; GFX9-NEXT: v_readlane_b32 s27, v40, 23
@@ -15557,8 +15555,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 26
; GFX10-NEXT: v_writelane_b32 v40, s31, 27
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 27
; GFX10-NEXT: v_readlane_b32 s30, v40, 26
+; GFX10-NEXT: v_readlane_b32 s31, v40, 27
; GFX10-NEXT: v_readlane_b32 s29, v40, 25
; GFX10-NEXT: v_readlane_b32 s28, v40, 24
; GFX10-NEXT: v_readlane_b32 s27, v40, 23
@@ -15660,8 +15658,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 27
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 27
; GFX11-NEXT: v_readlane_b32 s30, v40, 26
+; GFX11-NEXT: v_readlane_b32 s31, v40, 27
; GFX11-NEXT: v_readlane_b32 s29, v40, 25
; GFX11-NEXT: v_readlane_b32 s28, v40, 24
; GFX11-NEXT: v_readlane_b32 s27, v40, 23
@@ -15764,8 +15762,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 26
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 27
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 27
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 26
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 27
; GFX10-SCRATCH-NEXT: v_readlane_b32 s29, v40, 25
; GFX10-SCRATCH-NEXT: v_readlane_b32 s28, v40, 24
; GFX10-SCRATCH-NEXT: v_readlane_b32 s27, v40, 23
@@ -15881,8 +15879,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
; GFX9-NEXT: v_writelane_b32 v40, s31, 27
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 27
; GFX9-NEXT: v_readlane_b32 s30, v40, 26
+; GFX9-NEXT: v_readlane_b32 s31, v40, 27
; GFX9-NEXT: v_readlane_b32 s29, v40, 25
; GFX9-NEXT: v_readlane_b32 s28, v40, 24
; GFX9-NEXT: v_readlane_b32 s27, v40, 23
@@ -15993,8 +15991,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 26
; GFX10-NEXT: v_writelane_b32 v40, s31, 27
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 27
; GFX10-NEXT: v_readlane_b32 s30, v40, 26
+; GFX10-NEXT: v_readlane_b32 s31, v40, 27
; GFX10-NEXT: v_readlane_b32 s29, v40, 25
; GFX10-NEXT: v_readlane_b32 s28, v40, 24
; GFX10-NEXT: v_readlane_b32 s27, v40, 23
@@ -16100,8 +16098,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 27
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 27
; GFX11-NEXT: v_readlane_b32 s30, v40, 26
+; GFX11-NEXT: v_readlane_b32 s31, v40, 27
; GFX11-NEXT: v_readlane_b32 s29, v40, 25
; GFX11-NEXT: v_readlane_b32 s28, v40, 24
; GFX11-NEXT: v_readlane_b32 s27, v40, 23
@@ -16210,8 +16208,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 26
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 27
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 27
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 26
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 27
; GFX10-SCRATCH-NEXT: v_readlane_b32 s29, v40, 25
; GFX10-SCRATCH-NEXT: v_readlane_b32 s28, v40, 24
; GFX10-SCRATCH-NEXT: v_readlane_b32 s27, v40, 23
@@ -16276,8 +16274,8 @@ define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, d
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -16310,8 +16308,8 @@ define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, d
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -16340,8 +16338,8 @@ define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, d
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: scratch_store_b64 off, v[32:33], s32
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -16370,8 +16368,8 @@ define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, d
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT: scratch_store_dwordx2 off, v[32:33], s32
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -16442,8 +16440,8 @@ define amdgpu_gfx void @stack_12xv3i32() #0 {
; GFX9-NEXT: v_mov_b32_e32 v31, 11
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -16509,8 +16507,8 @@ define amdgpu_gfx void @stack_12xv3i32() #0 {
; GFX10-NEXT: s_mov_b32 s34, external_void_func_12xv3i32@abs32@lo
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -16556,8 +16554,8 @@ define amdgpu_gfx void @stack_12xv3i32() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -16620,8 +16618,8 @@ define amdgpu_gfx void @stack_12xv3i32() #0 {
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_12xv3i32@abs32@lo
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -16712,8 +16710,8 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
; GFX9-NEXT: v_mov_b32_e32 v31, 7
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -16787,8 +16785,8 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
; GFX10-NEXT: s_mov_b32 s34, external_void_func_8xv5i32@abs32@lo
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -16838,8 +16836,8 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -16908,8 +16906,8 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_8xv5i32@abs32@lo
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -16996,8 +16994,8 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
; GFX9-NEXT: v_mov_b32_e32 v31, 0x40e00000
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -17071,8 +17069,8 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
; GFX10-NEXT: s_mov_b32 s34, external_void_func_8xv5f32@abs32@lo
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -17127,8 +17125,8 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -17197,8 +17195,8 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_8xv5f32@abs32@lo
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -17237,8 +17235,8 @@ define amdgpu_gfx void @test_call_external_void_func_bf16(i16 %arg) #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -17264,8 +17262,8 @@ define amdgpu_gfx void @test_call_external_void_func_bf16(i16 %arg) #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -17292,8 +17290,8 @@ define amdgpu_gfx void @test_call_external_void_func_bf16(i16 %arg) #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -17319,8 +17317,8 @@ define amdgpu_gfx void @test_call_external_void_func_bf16(i16 %arg) #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -17351,8 +17349,8 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16(i16 %arg) #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -17378,8 +17376,8 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16(i16 %arg) #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -17406,8 +17404,8 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16(i16 %arg) #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -17433,8 +17431,8 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16(i16 %arg) #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -17465,8 +17463,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16(i32 %arg) #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -17492,8 +17490,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16(i32 %arg) #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -17520,8 +17518,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16(i32 %arg) #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -17547,8 +17545,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16(i32 %arg) #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -17579,8 +17577,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16(<3 x i16> %arg) #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -17606,8 +17604,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16(<3 x i16> %arg) #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -17634,8 +17632,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16(<3 x i16> %arg) #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -17661,8 +17659,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16(<3 x i16> %arg) #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -17693,8 +17691,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16(<4 x i16> %arg) #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -17720,8 +17718,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16(<4 x i16> %arg) #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -17748,8 +17746,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16(<4 x i16> %arg) #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -17775,8 +17773,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16(<4 x i16> %arg) #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -17807,8 +17805,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16(<8 x i16> %arg) #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -17834,8 +17832,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16(<8 x i16> %arg) #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -17862,8 +17860,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16(<8 x i16> %arg) #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -17889,8 +17887,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16(<8 x i16> %arg) #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -17921,8 +17919,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16(<16 x i16> %arg) #0
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -17948,8 +17946,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16(<16 x i16> %arg) #0
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -17976,8 +17974,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16(<16 x i16> %arg) #0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -18003,8 +18001,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16(<16 x i16> %arg) #0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -18035,8 +18033,8 @@ define amdgpu_gfx void @test_call_external_void_func_bf16_inreg(i16 inreg %arg)
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -18062,8 +18060,8 @@ define amdgpu_gfx void @test_call_external_void_func_bf16_inreg(i16 inreg %arg)
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -18090,8 +18088,8 @@ define amdgpu_gfx void @test_call_external_void_func_bf16_inreg(i16 inreg %arg)
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -18117,8 +18115,8 @@ define amdgpu_gfx void @test_call_external_void_func_bf16_inreg(i16 inreg %arg)
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -18149,8 +18147,8 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16_inreg(i16 inreg %arg
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -18176,8 +18174,8 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16_inreg(i16 inreg %arg
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -18204,8 +18202,8 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16_inreg(i16 inreg %arg
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -18231,8 +18229,8 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16_inreg(i16 inreg %arg
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -18263,8 +18261,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16_inreg(i32 inreg %arg
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -18290,8 +18288,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16_inreg(i32 inreg %arg
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -18318,8 +18316,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16_inreg(i32 inreg %arg
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -18345,8 +18343,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16_inreg(i32 inreg %arg
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -18377,8 +18375,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16_inreg(<3 x i16> inre
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -18404,8 +18402,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16_inreg(<3 x i16> inre
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -18432,8 +18430,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16_inreg(<3 x i16> inre
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -18459,8 +18457,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16_inreg(<3 x i16> inre
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -18491,8 +18489,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16_inreg(<4 x i16> inre
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -18518,8 +18516,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16_inreg(<4 x i16> inre
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -18546,8 +18544,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16_inreg(<4 x i16> inre
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -18573,8 +18571,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16_inreg(<4 x i16> inre
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -18605,8 +18603,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16_inreg(<8 x i16> inre
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -18632,8 +18630,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16_inreg(<8 x i16> inre
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -18660,8 +18658,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16_inreg(<8 x i16> inre
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -18687,8 +18685,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16_inreg(<8 x i16> inre
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -18719,8 +18717,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16_inreg(<16 x i16> in
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -18746,8 +18744,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16_inreg(<16 x i16> in
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -18774,8 +18772,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16_inreg(<16 x i16> in
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -18801,8 +18799,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16_inreg(<16 x i16> in
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll
index 124de7e00f020..ecf2b33841076 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll
@@ -26,8 +26,8 @@ define amdgpu_gfx void @test_call_external_void_func_void_clobber_s30_s31_call_e
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 3
; GFX9-NEXT: v_readlane_b32 s30, v40, 2
+; GFX9-NEXT: v_readlane_b32 s31, v40, 3
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_mov_b32 s32, s33
@@ -60,8 +60,8 @@ define amdgpu_gfx void @test_call_external_void_func_void_clobber_s30_s31_call_e
; GFX10-NEXT: ;;#ASMSTART
; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-NEXT: v_readlane_b32 s30, v40, 2
+; GFX10-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_mov_b32 s32, s33
@@ -95,8 +95,8 @@ define amdgpu_gfx void @test_call_external_void_func_void_clobber_s30_s31_call_e
; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 3
; GFX11-NEXT: v_readlane_b32 s30, v40, 2
+; GFX11-NEXT: v_readlane_b32 s31, v40, 3
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -130,8 +130,8 @@ define amdgpu_gfx void @void_func_void_clobber_s28_s29() #1 {
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; clobber
; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: v_readlane_b32 s31, v0, 3
; GFX9-NEXT: v_readlane_b32 s30, v0, 2
+; GFX9-NEXT: v_readlane_b32 s31, v0, 3
; GFX9-NEXT: v_readlane_b32 s29, v0, 1
; GFX9-NEXT: v_readlane_b32 s28, v0, 0
; GFX9-NEXT: s_xor_saveexec_b64 s[34:35], -1
@@ -157,8 +157,8 @@ define amdgpu_gfx void @void_func_void_clobber_s28_s29() #1 {
; GFX10-NEXT: ;;#ASMSTART
; GFX10-NEXT: ; clobber
; GFX10-NEXT: ;;#ASMEND
-; GFX10-NEXT: v_readlane_b32 s31, v0, 3
; GFX10-NEXT: v_readlane_b32 s30, v0, 2
+; GFX10-NEXT: v_readlane_b32 s31, v0, 3
; GFX10-NEXT: v_readlane_b32 s29, v0, 1
; GFX10-NEXT: v_readlane_b32 s28, v0, 0
; GFX10-NEXT: s_xor_saveexec_b32 s34, -1
@@ -185,8 +185,8 @@ define amdgpu_gfx void @void_func_void_clobber_s28_s29() #1 {
; GFX11-NEXT: ; clobber
; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v0, 3
; GFX11-NEXT: v_readlane_b32 s30, v0, 2
+; GFX11-NEXT: v_readlane_b32 s31, v0, 3
; GFX11-NEXT: v_readlane_b32 s29, v0, 1
; GFX11-NEXT: v_readlane_b32 s28, v0, 0
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
@@ -224,8 +224,8 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_s31(ptr addrspace(1)
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; use s31
; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: v_readlane_b32 s31, v40, 2
; GFX9-NEXT: v_readlane_b32 s30, v40, 1
+; GFX9-NEXT: v_readlane_b32 s31, v40, 2
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 3
@@ -261,8 +261,8 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_s31(ptr addrspace(1)
; GFX10-NEXT: ;;#ASMSTART
; GFX10-NEXT: ; use s31
; GFX10-NEXT: ;;#ASMEND
-; GFX10-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-NEXT: v_readlane_b32 s30, v40, 1
+; GFX10-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 3
@@ -298,8 +298,8 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_s31(ptr addrspace(1)
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; use s31
; GFX11-NEXT: ;;#ASMEND
-; GFX11-NEXT: v_readlane_b32 s31, v40, 2
; GFX11-NEXT: v_readlane_b32 s30, v40, 1
+; GFX11-NEXT: v_readlane_b32 s31, v40, 2
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 3
@@ -341,8 +341,8 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(ptr addrspace(1)
; GFX9-NEXT: ; use v31
; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-NEXT: v_readlane_b32 s31, v41, 1
; GFX9-NEXT: v_readlane_b32 s30, v41, 0
+; GFX9-NEXT: v_readlane_b32 s31, v41, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v41, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -378,8 +378,8 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(ptr addrspace(1)
; GFX10-NEXT: ; use v31
; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX10-NEXT: v_readlane_b32 s31, v41, 1
; GFX10-NEXT: v_readlane_b32 s30, v41, 0
+; GFX10-NEXT: v_readlane_b32 s31, v41, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v41, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -416,8 +416,8 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(ptr addrspace(1)
; GFX11-NEXT: ; use v31
; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
-; GFX11-NEXT: v_readlane_b32 s31, v41, 1
; GFX11-NEXT: v_readlane_b32 s30, v41, 0
+; GFX11-NEXT: v_readlane_b32 s31, v41, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v41, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -455,11 +455,11 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s33(ptr addrspace(1)
; GFX9-NEXT: s_mov_b32 s4, s33
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: s_mov_b32 s33, s4
+; GFX9-NEXT: v_readlane_b32 s30, v40, 1
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; use s33
; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: v_readlane_b32 s31, v40, 2
-; GFX9-NEXT: v_readlane_b32 s30, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 3
@@ -492,11 +492,11 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s33(ptr addrspace(1)
; GFX10-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: s_mov_b32 s33, s4
+; GFX10-NEXT: v_readlane_b32 s30, v40, 1
; GFX10-NEXT: ;;#ASMSTART
; GFX10-NEXT: ; use s33
; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: v_readlane_b32 s31, v40, 2
-; GFX10-NEXT: v_readlane_b32 s30, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 3
@@ -529,12 +529,12 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s33(ptr addrspace(1)
; GFX11-NEXT: v_writelane_b32 v40, s31, 2
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_mov_b32 s33, s4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_readlane_b32 s30, v40, 1
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; use s33
; GFX11-NEXT: ;;#ASMEND
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 2
-; GFX11-NEXT: v_readlane_b32 s30, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 3
@@ -572,11 +572,11 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s34(ptr addrspace(1)
; GFX9-NEXT: v_writelane_b32 v40, s31, 2
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: s_mov_b32 s34, s4
+; GFX9-NEXT: v_readlane_b32 s30, v40, 1
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; use s34
; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: v_readlane_b32 s31, v40, 2
-; GFX9-NEXT: v_readlane_b32 s30, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 3
@@ -609,11 +609,11 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s34(ptr addrspace(1)
; GFX10-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: s_mov_b32 s34, s4
+; GFX10-NEXT: v_readlane_b32 s30, v40, 1
; GFX10-NEXT: ;;#ASMSTART
; GFX10-NEXT: ; use s34
; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: v_readlane_b32 s31, v40, 2
-; GFX10-NEXT: v_readlane_b32 s30, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 3
@@ -645,13 +645,13 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s34(ptr addrspace(1)
; GFX11-NEXT: v_writelane_b32 v40, s30, 1
; GFX11-NEXT: v_writelane_b32 v40, s31, 2
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_readlane_b32 s30, v40, 1
; GFX11-NEXT: s_mov_b32 s34, s4
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; use s34
; GFX11-NEXT: ;;#ASMEND
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 2
-; GFX11-NEXT: v_readlane_b32 s30, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 3
@@ -691,8 +691,8 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_v40(ptr addrspace(1)
; GFX9-NEXT: ; use v40
; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-NEXT: v_readlane_b32 s31, v41, 1
; GFX9-NEXT: v_readlane_b32 s30, v41, 0
+; GFX9-NEXT: v_readlane_b32 s31, v41, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v41, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -726,8 +726,8 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_v40(ptr addrspace(1)
; GFX10-NEXT: ; use v40
; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX10-NEXT: v_readlane_b32 s31, v41, 1
; GFX10-NEXT: v_readlane_b32 s30, v41, 0
+; GFX10-NEXT: v_readlane_b32 s31, v41, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v41, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -761,8 +761,8 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_v40(ptr addrspace(1)
; GFX11-NEXT: ; use v40
; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
-; GFX11-NEXT: v_readlane_b32 s31, v41, 1
; GFX11-NEXT: v_readlane_b32 s30, v41, 0
+; GFX11-NEXT: v_readlane_b32 s31, v41, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v41, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -849,8 +849,8 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s33() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -876,8 +876,8 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s33() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -904,8 +904,8 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s33() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -934,8 +934,8 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s34() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -961,8 +961,8 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s34() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -989,8 +989,8 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s34() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -1024,11 +1024,11 @@ define amdgpu_gfx void @callee_saved_sgpr_kernel() #1 {
; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: s_mov_b32 s4, s40
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
+; GFX9-NEXT: v_readlane_b32 s30, v40, 1
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; use s4
; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: v_readlane_b32 s31, v40, 2
-; GFX9-NEXT: v_readlane_b32 s30, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 3
@@ -1060,11 +1060,11 @@ define amdgpu_gfx void @callee_saved_sgpr_kernel() #1 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
+; GFX10-NEXT: v_readlane_b32 s30, v40, 1
; GFX10-NEXT: ;;#ASMSTART
; GFX10-NEXT: ; use s4
; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: v_readlane_b32 s31, v40, 2
-; GFX10-NEXT: v_readlane_b32 s30, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 3
@@ -1096,12 +1096,12 @@ define amdgpu_gfx void @callee_saved_sgpr_kernel() #1 {
; GFX11-NEXT: v_writelane_b32 v40, s30, 1
; GFX11-NEXT: v_writelane_b32 v40, s31, 2
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_readlane_b32 s30, v40, 1
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; use s4
; GFX11-NEXT: ;;#ASMEND
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 2
-; GFX11-NEXT: v_readlane_b32 s30, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 3
@@ -1150,8 +1150,8 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 {
; GFX9-NEXT: ; use v40
; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-NEXT: v_readlane_b32 s31, v41, 2
; GFX9-NEXT: v_readlane_b32 s30, v41, 1
+; GFX9-NEXT: v_readlane_b32 s31, v41, 2
; GFX9-NEXT: v_readlane_b32 s4, v41, 0
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v41, 3
@@ -1195,8 +1195,8 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 {
; GFX10-NEXT: ; use v40
; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX10-NEXT: v_readlane_b32 s31, v41, 2
; GFX10-NEXT: v_readlane_b32 s30, v41, 1
+; GFX10-NEXT: v_readlane_b32 s31, v41, 2
; GFX10-NEXT: v_readlane_b32 s4, v41, 0
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v41, 3
@@ -1240,8 +1240,8 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 {
; GFX11-NEXT: ; use v40
; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
-; GFX11-NEXT: v_readlane_b32 s31, v41, 2
; GFX11-NEXT: v_readlane_b32 s30, v41, 1
+; GFX11-NEXT: v_readlane_b32 s31, v41, 2
; GFX11-NEXT: v_readlane_b32 s4, v41, 0
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v41, 3
diff --git a/llvm/test/CodeGen/AMDGPU/global-alias.ll b/llvm/test/CodeGen/AMDGPU/global-alias.ll
index d8df20eb69452..4c7bef4aec091 100644
--- a/llvm/test/CodeGen/AMDGPU/global-alias.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-alias.ll
@@ -35,8 +35,8 @@ define void @bar() {
; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT: v_readlane_b32 s31, v40, 1
; CHECK-NEXT: v_readlane_b32 s30, v40, 0
+; CHECK-NEXT: v_readlane_b32 s31, v40, 1
; CHECK-NEXT: s_mov_b32 s32, s33
; CHECK-NEXT: v_readlane_b32 s4, v40, 2
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
index 76f204dd0c16a..ae13753004c3d 100644
--- a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
+++ b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
@@ -9,28 +9,30 @@ define void @main(i1 %arg) #0 {
; CHECK-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[4:5]
-; CHECK-NEXT: v_writelane_b32 v6, s30, 0
-; CHECK-NEXT: v_writelane_b32 v6, s31, 1
-; CHECK-NEXT: v_writelane_b32 v6, s36, 2
-; CHECK-NEXT: v_writelane_b32 v6, s37, 3
-; CHECK-NEXT: v_writelane_b32 v6, s38, 4
-; CHECK-NEXT: v_writelane_b32 v6, s39, 5
-; CHECK-NEXT: v_writelane_b32 v6, s48, 6
-; CHECK-NEXT: v_writelane_b32 v6, s49, 7
-; CHECK-NEXT: v_writelane_b32 v6, s50, 8
-; CHECK-NEXT: v_writelane_b32 v6, s51, 9
-; CHECK-NEXT: v_writelane_b32 v6, s52, 10
-; CHECK-NEXT: v_writelane_b32 v6, s53, 11
-; CHECK-NEXT: v_writelane_b32 v6, s54, 12
-; CHECK-NEXT: v_writelane_b32 v6, s55, 13
-; CHECK-NEXT: v_writelane_b32 v6, s64, 14
-; CHECK-NEXT: v_writelane_b32 v6, s65, 15
-; CHECK-NEXT: v_writelane_b32 v6, s66, 16
-; CHECK-NEXT: v_writelane_b32 v6, s67, 17
-; CHECK-NEXT: v_writelane_b32 v6, s68, 18
+; CHECK-NEXT: v_writelane_b32 v6, s36, 0
+; CHECK-NEXT: v_writelane_b32 v6, s37, 1
+; CHECK-NEXT: v_writelane_b32 v6, s38, 2
+; CHECK-NEXT: v_writelane_b32 v6, s39, 3
+; CHECK-NEXT: v_writelane_b32 v6, s48, 4
+; CHECK-NEXT: v_writelane_b32 v6, s49, 5
+; CHECK-NEXT: v_writelane_b32 v6, s50, 6
+; CHECK-NEXT: v_writelane_b32 v6, s51, 7
+; CHECK-NEXT: v_writelane_b32 v6, s52, 8
+; CHECK-NEXT: v_writelane_b32 v6, s53, 9
+; CHECK-NEXT: v_writelane_b32 v6, s54, 10
+; CHECK-NEXT: v_writelane_b32 v6, s55, 11
+; CHECK-NEXT: v_writelane_b32 v6, s64, 12
+; CHECK-NEXT: v_writelane_b32 v6, s65, 13
+; CHECK-NEXT: v_writelane_b32 v6, s66, 14
+; CHECK-NEXT: v_writelane_b32 v6, s67, 15
+; CHECK-NEXT: v_writelane_b32 v6, s68, 16
+; CHECK-NEXT: v_writelane_b32 v6, s69, 17
+; CHECK-NEXT: v_writelane_b32 v6, s70, 18
+; CHECK-NEXT: v_writelane_b32 v6, s71, 19
+; CHECK-NEXT: v_writelane_b32 v6, s30, 20
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_mov_b64 s[8:9], 0
-; CHECK-NEXT: v_writelane_b32 v6, s69, 19
+; CHECK-NEXT: v_writelane_b32 v6, s31, 21
; CHECK-NEXT: s_mov_b32 s68, 0
; CHECK-NEXT: s_mov_b32 s69, s4
; CHECK-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0
@@ -40,11 +42,11 @@ define void @main(i1 %arg) #0 {
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: s_load_dwordx16 s[8:23], s[68:69], 0x130
; CHECK-NEXT: ; implicit-def: $vgpr7 : SGPR spill to VGPR lane
-; CHECK-NEXT: v_writelane_b32 v6, s70, 20
-; CHECK-NEXT: v_writelane_b32 v6, s71, 21
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v1, s4
; CHECK-NEXT: v_mov_b32_e32 v2, 0
+; CHECK-NEXT: s_load_dwordx16 s[36:51], s[68:69], 0x2f0
+; CHECK-NEXT: s_mov_b32 s70, s68
; CHECK-NEXT: v_writelane_b32 v7, s8, 0
; CHECK-NEXT: v_writelane_b32 v7, s9, 1
; CHECK-NEXT: v_writelane_b32 v7, s10, 2
@@ -77,9 +79,7 @@ define void @main(i1 %arg) #0 {
; CHECK-NEXT: v_writelane_b32 v7, s65, 29
; CHECK-NEXT: v_writelane_b32 v7, s66, 30
; CHECK-NEXT: s_load_dwordx16 s[8:23], s[68:69], 0x1f0
-; CHECK-NEXT: s_load_dwordx16 s[36:51], s[68:69], 0x2f0
; CHECK-NEXT: s_mov_b32 s69, s68
-; CHECK-NEXT: s_mov_b32 s70, s68
; CHECK-NEXT: s_mov_b32 s71, s68
; CHECK-NEXT: v_writelane_b32 v7, s67, 31
; CHECK-NEXT: image_sample_lz v1, v[1:2], s[60:67], s[68:71] dmask:0x1
@@ -225,29 +225,29 @@ define void @main(i1 %arg) #0 {
; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
; CHECK-NEXT: .LBB0_10: ; %UnifiedReturnBlock
; CHECK-NEXT: s_or_b64 exec, exec, s[6:7]
-; CHECK-NEXT: v_readlane_b32 s71, v6, 21
-; CHECK-NEXT: v_readlane_b32 s70, v6, 20
-; CHECK-NEXT: v_readlane_b32 s69, v6, 19
-; CHECK-NEXT: v_readlane_b32 s68, v6, 18
-; CHECK-NEXT: v_readlane_b32 s67, v6, 17
-; CHECK-NEXT: v_readlane_b32 s66, v6, 16
-; CHECK-NEXT: v_readlane_b32 s65, v6, 15
-; CHECK-NEXT: v_readlane_b32 s64, v6, 14
-; CHECK-NEXT: v_readlane_b32 s55, v6, 13
-; CHECK-NEXT: v_readlane_b32 s54, v6, 12
-; CHECK-NEXT: v_readlane_b32 s53, v6, 11
-; CHECK-NEXT: v_readlane_b32 s52, v6, 10
+; CHECK-NEXT: v_readlane_b32 s30, v6, 20
+; CHECK-NEXT: v_readlane_b32 s31, v6, 21
+; CHECK-NEXT: v_readlane_b32 s71, v6, 19
+; CHECK-NEXT: v_readlane_b32 s70, v6, 18
+; CHECK-NEXT: v_readlane_b32 s69, v6, 17
+; CHECK-NEXT: v_readlane_b32 s68, v6, 16
+; CHECK-NEXT: v_readlane_b32 s67, v6, 15
+; CHECK-NEXT: v_readlane_b32 s66, v6, 14
+; CHECK-NEXT: v_readlane_b32 s65, v6, 13
+; CHECK-NEXT: v_readlane_b32 s64, v6, 12
+; CHECK-NEXT: v_readlane_b32 s55, v6, 11
+; CHECK-NEXT: v_readlane_b32 s54, v6, 10
+; CHECK-NEXT: v_readlane_b32 s53, v6, 9
+; CHECK-NEXT: v_readlane_b32 s52, v6, 8
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_readlane_b32 s51, v6, 9
-; CHECK-NEXT: v_readlane_b32 s50, v6, 8
-; CHECK-NEXT: v_readlane_b32 s49, v6, 7
-; CHECK-NEXT: v_readlane_b32 s48, v6, 6
-; CHECK-NEXT: v_readlane_b32 s39, v6, 5
-; CHECK-NEXT: v_readlane_b32 s38, v6, 4
-; CHECK-NEXT: v_readlane_b32 s37, v6, 3
-; CHECK-NEXT: v_readlane_b32 s36, v6, 2
-; CHECK-NEXT: v_readlane_b32 s31, v6, 1
-; CHECK-NEXT: v_readlane_b32 s30, v6, 0
+; CHECK-NEXT: v_readlane_b32 s51, v6, 7
+; CHECK-NEXT: v_readlane_b32 s50, v6, 6
+; CHECK-NEXT: v_readlane_b32 s49, v6, 5
+; CHECK-NEXT: v_readlane_b32 s48, v6, 4
+; CHECK-NEXT: v_readlane_b32 s39, v6, 3
+; CHECK-NEXT: v_readlane_b32 s38, v6, 2
+; CHECK-NEXT: v_readlane_b32 s37, v6, 1
+; CHECK-NEXT: v_readlane_b32 s36, v6, 0
; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1
; CHECK-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
index a208cfdb197af..2aaaff1ecc407 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
@@ -128,24 +128,24 @@ define void @test_indirect_call_vgpr_ptr(ptr %fptr) {
; GCN-NEXT: s_mov_b64 exec, s[18:19]
; GCN-NEXT: v_writelane_b32 v40, s16, 18
; GCN-NEXT: s_addk_i32 s32, 0x400
-; GCN-NEXT: v_writelane_b32 v40, s30, 0
-; GCN-NEXT: v_writelane_b32 v40, s31, 1
-; GCN-NEXT: v_writelane_b32 v40, s34, 2
-; GCN-NEXT: v_writelane_b32 v40, s35, 3
-; GCN-NEXT: v_writelane_b32 v40, s36, 4
-; GCN-NEXT: v_writelane_b32 v40, s37, 5
-; GCN-NEXT: v_writelane_b32 v40, s38, 6
-; GCN-NEXT: v_writelane_b32 v40, s39, 7
-; GCN-NEXT: v_writelane_b32 v40, s48, 8
-; GCN-NEXT: v_writelane_b32 v40, s49, 9
-; GCN-NEXT: v_writelane_b32 v40, s50, 10
-; GCN-NEXT: v_writelane_b32 v40, s51, 11
-; GCN-NEXT: v_writelane_b32 v40, s52, 12
-; GCN-NEXT: v_writelane_b32 v40, s53, 13
-; GCN-NEXT: v_writelane_b32 v40, s54, 14
-; GCN-NEXT: v_writelane_b32 v40, s55, 15
-; GCN-NEXT: v_writelane_b32 v40, s64, 16
-; GCN-NEXT: v_writelane_b32 v40, s65, 17
+; GCN-NEXT: v_writelane_b32 v40, s34, 0
+; GCN-NEXT: v_writelane_b32 v40, s35, 1
+; GCN-NEXT: v_writelane_b32 v40, s36, 2
+; GCN-NEXT: v_writelane_b32 v40, s37, 3
+; GCN-NEXT: v_writelane_b32 v40, s38, 4
+; GCN-NEXT: v_writelane_b32 v40, s39, 5
+; GCN-NEXT: v_writelane_b32 v40, s48, 6
+; GCN-NEXT: v_writelane_b32 v40, s49, 7
+; GCN-NEXT: v_writelane_b32 v40, s50, 8
+; GCN-NEXT: v_writelane_b32 v40, s51, 9
+; GCN-NEXT: v_writelane_b32 v40, s52, 10
+; GCN-NEXT: v_writelane_b32 v40, s53, 11
+; GCN-NEXT: v_writelane_b32 v40, s54, 12
+; GCN-NEXT: v_writelane_b32 v40, s55, 13
+; GCN-NEXT: v_writelane_b32 v40, s64, 14
+; GCN-NEXT: v_writelane_b32 v40, s65, 15
+; GCN-NEXT: v_writelane_b32 v40, s30, 16
+; GCN-NEXT: v_writelane_b32 v40, s31, 17
; GCN-NEXT: s_mov_b32 s50, s15
; GCN-NEXT: s_mov_b32 s51, s14
; GCN-NEXT: s_mov_b32 s52, s13
@@ -175,24 +175,24 @@ define void @test_indirect_call_vgpr_ptr(ptr %fptr) {
; GCN-NEXT: s_cbranch_execnz .LBB2_1
; GCN-NEXT: ; %bb.2:
; GCN-NEXT: s_mov_b64 exec, s[54:55]
-; GCN-NEXT: v_readlane_b32 s65, v40, 17
-; GCN-NEXT: v_readlane_b32 s64, v40, 16
-; GCN-NEXT: v_readlane_b32 s55, v40, 15
-; GCN-NEXT: v_readlane_b32 s54, v40, 14
-; GCN-NEXT: v_readlane_b32 s53, v40, 13
-; GCN-NEXT: v_readlane_b32 s52, v40, 12
-; GCN-NEXT: v_readlane_b32 s51, v40, 11
-; GCN-NEXT: v_readlane_b32 s50, v40, 10
-; GCN-NEXT: v_readlane_b32 s49, v40, 9
-; GCN-NEXT: v_readlane_b32 s48, v40, 8
-; GCN-NEXT: v_readlane_b32 s39, v40, 7
-; GCN-NEXT: v_readlane_b32 s38, v40, 6
-; GCN-NEXT: v_readlane_b32 s37, v40, 5
-; GCN-NEXT: v_readlane_b32 s36, v40, 4
-; GCN-NEXT: v_readlane_b32 s35, v40, 3
-; GCN-NEXT: v_readlane_b32 s34, v40, 2
-; GCN-NEXT: v_readlane_b32 s31, v40, 1
-; GCN-NEXT: v_readlane_b32 s30, v40, 0
+; GCN-NEXT: v_readlane_b32 s30, v40, 16
+; GCN-NEXT: v_readlane_b32 s31, v40, 17
+; GCN-NEXT: v_readlane_b32 s65, v40, 15
+; GCN-NEXT: v_readlane_b32 s64, v40, 14
+; GCN-NEXT: v_readlane_b32 s55, v40, 13
+; GCN-NEXT: v_readlane_b32 s54, v40, 12
+; GCN-NEXT: v_readlane_b32 s53, v40, 11
+; GCN-NEXT: v_readlane_b32 s52, v40, 10
+; GCN-NEXT: v_readlane_b32 s51, v40, 9
+; GCN-NEXT: v_readlane_b32 s50, v40, 8
+; GCN-NEXT: v_readlane_b32 s49, v40, 7
+; GCN-NEXT: v_readlane_b32 s48, v40, 6
+; GCN-NEXT: v_readlane_b32 s39, v40, 5
+; GCN-NEXT: v_readlane_b32 s38, v40, 4
+; GCN-NEXT: v_readlane_b32 s37, v40, 3
+; GCN-NEXT: v_readlane_b32 s36, v40, 2
+; GCN-NEXT: v_readlane_b32 s35, v40, 1
+; GCN-NEXT: v_readlane_b32 s34, v40, 0
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: v_readlane_b32 s4, v40, 18
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -212,24 +212,24 @@ define void @test_indirect_call_vgpr_ptr(ptr %fptr) {
; GISEL-NEXT: s_mov_b64 exec, s[18:19]
; GISEL-NEXT: v_writelane_b32 v40, s16, 18
; GISEL-NEXT: s_addk_i32 s32, 0x400
-; GISEL-NEXT: v_writelane_b32 v40, s30, 0
-; GISEL-NEXT: v_writelane_b32 v40, s31, 1
-; GISEL-NEXT: v_writelane_b32 v40, s34, 2
-; GISEL-NEXT: v_writelane_b32 v40, s35, 3
-; GISEL-NEXT: v_writelane_b32 v40, s36, 4
-; GISEL-NEXT: v_writelane_b32 v40, s37, 5
-; GISEL-NEXT: v_writelane_b32 v40, s38, 6
-; GISEL-NEXT: v_writelane_b32 v40, s39, 7
-; GISEL-NEXT: v_writelane_b32 v40, s48, 8
-; GISEL-NEXT: v_writelane_b32 v40, s49, 9
-; GISEL-NEXT: v_writelane_b32 v40, s50, 10
-; GISEL-NEXT: v_writelane_b32 v40, s51, 11
-; GISEL-NEXT: v_writelane_b32 v40, s52, 12
-; GISEL-NEXT: v_writelane_b32 v40, s53, 13
-; GISEL-NEXT: v_writelane_b32 v40, s54, 14
-; GISEL-NEXT: v_writelane_b32 v40, s55, 15
-; GISEL-NEXT: v_writelane_b32 v40, s64, 16
-; GISEL-NEXT: v_writelane_b32 v40, s65, 17
+; GISEL-NEXT: v_writelane_b32 v40, s34, 0
+; GISEL-NEXT: v_writelane_b32 v40, s35, 1
+; GISEL-NEXT: v_writelane_b32 v40, s36, 2
+; GISEL-NEXT: v_writelane_b32 v40, s37, 3
+; GISEL-NEXT: v_writelane_b32 v40, s38, 4
+; GISEL-NEXT: v_writelane_b32 v40, s39, 5
+; GISEL-NEXT: v_writelane_b32 v40, s48, 6
+; GISEL-NEXT: v_writelane_b32 v40, s49, 7
+; GISEL-NEXT: v_writelane_b32 v40, s50, 8
+; GISEL-NEXT: v_writelane_b32 v40, s51, 9
+; GISEL-NEXT: v_writelane_b32 v40, s52, 10
+; GISEL-NEXT: v_writelane_b32 v40, s53, 11
+; GISEL-NEXT: v_writelane_b32 v40, s54, 12
+; GISEL-NEXT: v_writelane_b32 v40, s55, 13
+; GISEL-NEXT: v_writelane_b32 v40, s64, 14
+; GISEL-NEXT: v_writelane_b32 v40, s65, 15
+; GISEL-NEXT: v_writelane_b32 v40, s30, 16
+; GISEL-NEXT: v_writelane_b32 v40, s31, 17
; GISEL-NEXT: s_mov_b32 s50, s15
; GISEL-NEXT: s_mov_b32 s51, s14
; GISEL-NEXT: s_mov_b32 s52, s13
@@ -259,24 +259,24 @@ define void @test_indirect_call_vgpr_ptr(ptr %fptr) {
; GISEL-NEXT: s_cbranch_execnz .LBB2_1
; GISEL-NEXT: ; %bb.2:
; GISEL-NEXT: s_mov_b64 exec, s[54:55]
-; GISEL-NEXT: v_readlane_b32 s65, v40, 17
-; GISEL-NEXT: v_readlane_b32 s64, v40, 16
-; GISEL-NEXT: v_readlane_b32 s55, v40, 15
-; GISEL-NEXT: v_readlane_b32 s54, v40, 14
-; GISEL-NEXT: v_readlane_b32 s53, v40, 13
-; GISEL-NEXT: v_readlane_b32 s52, v40, 12
-; GISEL-NEXT: v_readlane_b32 s51, v40, 11
-; GISEL-NEXT: v_readlane_b32 s50, v40, 10
-; GISEL-NEXT: v_readlane_b32 s49, v40, 9
-; GISEL-NEXT: v_readlane_b32 s48, v40, 8
-; GISEL-NEXT: v_readlane_b32 s39, v40, 7
-; GISEL-NEXT: v_readlane_b32 s38, v40, 6
-; GISEL-NEXT: v_readlane_b32 s37, v40, 5
-; GISEL-NEXT: v_readlane_b32 s36, v40, 4
-; GISEL-NEXT: v_readlane_b32 s35, v40, 3
-; GISEL-NEXT: v_readlane_b32 s34, v40, 2
-; GISEL-NEXT: v_readlane_b32 s31, v40, 1
-; GISEL-NEXT: v_readlane_b32 s30, v40, 0
+; GISEL-NEXT: v_readlane_b32 s30, v40, 16
+; GISEL-NEXT: v_readlane_b32 s31, v40, 17
+; GISEL-NEXT: v_readlane_b32 s65, v40, 15
+; GISEL-NEXT: v_readlane_b32 s64, v40, 14
+; GISEL-NEXT: v_readlane_b32 s55, v40, 13
+; GISEL-NEXT: v_readlane_b32 s54, v40, 12
+; GISEL-NEXT: v_readlane_b32 s53, v40, 11
+; GISEL-NEXT: v_readlane_b32 s52, v40, 10
+; GISEL-NEXT: v_readlane_b32 s51, v40, 9
+; GISEL-NEXT: v_readlane_b32 s50, v40, 8
+; GISEL-NEXT: v_readlane_b32 s49, v40, 7
+; GISEL-NEXT: v_readlane_b32 s48, v40, 6
+; GISEL-NEXT: v_readlane_b32 s39, v40, 5
+; GISEL-NEXT: v_readlane_b32 s38, v40, 4
+; GISEL-NEXT: v_readlane_b32 s37, v40, 3
+; GISEL-NEXT: v_readlane_b32 s36, v40, 2
+; GISEL-NEXT: v_readlane_b32 s35, v40, 1
+; GISEL-NEXT: v_readlane_b32 s34, v40, 0
; GISEL-NEXT: s_mov_b32 s32, s33
; GISEL-NEXT: v_readlane_b32 s4, v40, 18
; GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -300,24 +300,24 @@ define void @test_indirect_call_vgpr_ptr_arg(ptr %fptr) {
; GCN-NEXT: s_mov_b64 exec, s[18:19]
; GCN-NEXT: v_writelane_b32 v40, s16, 18
; GCN-NEXT: s_addk_i32 s32, 0x400
-; GCN-NEXT: v_writelane_b32 v40, s30, 0
-; GCN-NEXT: v_writelane_b32 v40, s31, 1
-; GCN-NEXT: v_writelane_b32 v40, s34, 2
-; GCN-NEXT: v_writelane_b32 v40, s35, 3
-; GCN-NEXT: v_writelane_b32 v40, s36, 4
-; GCN-NEXT: v_writelane_b32 v40, s37, 5
-; GCN-NEXT: v_writelane_b32 v40, s38, 6
-; GCN-NEXT: v_writelane_b32 v40, s39, 7
-; GCN-NEXT: v_writelane_b32 v40, s48, 8
-; GCN-NEXT: v_writelane_b32 v40, s49, 9
-; GCN-NEXT: v_writelane_b32 v40, s50, 10
-; GCN-NEXT: v_writelane_b32 v40, s51, 11
-; GCN-NEXT: v_writelane_b32 v40, s52, 12
-; GCN-NEXT: v_writelane_b32 v40, s53, 13
-; GCN-NEXT: v_writelane_b32 v40, s54, 14
-; GCN-NEXT: v_writelane_b32 v40, s55, 15
-; GCN-NEXT: v_writelane_b32 v40, s64, 16
-; GCN-NEXT: v_writelane_b32 v40, s65, 17
+; GCN-NEXT: v_writelane_b32 v40, s34, 0
+; GCN-NEXT: v_writelane_b32 v40, s35, 1
+; GCN-NEXT: v_writelane_b32 v40, s36, 2
+; GCN-NEXT: v_writelane_b32 v40, s37, 3
+; GCN-NEXT: v_writelane_b32 v40, s38, 4
+; GCN-NEXT: v_writelane_b32 v40, s39, 5
+; GCN-NEXT: v_writelane_b32 v40, s48, 6
+; GCN-NEXT: v_writelane_b32 v40, s49, 7
+; GCN-NEXT: v_writelane_b32 v40, s50, 8
+; GCN-NEXT: v_writelane_b32 v40, s51, 9
+; GCN-NEXT: v_writelane_b32 v40, s52, 10
+; GCN-NEXT: v_writelane_b32 v40, s53, 11
+; GCN-NEXT: v_writelane_b32 v40, s54, 12
+; GCN-NEXT: v_writelane_b32 v40, s55, 13
+; GCN-NEXT: v_writelane_b32 v40, s64, 14
+; GCN-NEXT: v_writelane_b32 v40, s65, 15
+; GCN-NEXT: v_writelane_b32 v40, s30, 16
+; GCN-NEXT: v_writelane_b32 v40, s31, 17
; GCN-NEXT: s_mov_b32 s50, s15
; GCN-NEXT: s_mov_b32 s51, s14
; GCN-NEXT: s_mov_b32 s52, s13
@@ -350,24 +350,24 @@ define void @test_indirect_call_vgpr_ptr_arg(ptr %fptr) {
; GCN-NEXT: s_cbranch_execnz .LBB3_1
; GCN-NEXT: ; %bb.2:
; GCN-NEXT: s_mov_b64 exec, s[54:55]
-; GCN-NEXT: v_readlane_b32 s65, v40, 17
-; GCN-NEXT: v_readlane_b32 s64, v40, 16
-; GCN-NEXT: v_readlane_b32 s55, v40, 15
-; GCN-NEXT: v_readlane_b32 s54, v40, 14
-; GCN-NEXT: v_readlane_b32 s53, v40, 13
-; GCN-NEXT: v_readlane_b32 s52, v40, 12
-; GCN-NEXT: v_readlane_b32 s51, v40, 11
-; GCN-NEXT: v_readlane_b32 s50, v40, 10
-; GCN-NEXT: v_readlane_b32 s49, v40, 9
-; GCN-NEXT: v_readlane_b32 s48, v40, 8
-; GCN-NEXT: v_readlane_b32 s39, v40, 7
-; GCN-NEXT: v_readlane_b32 s38, v40, 6
-; GCN-NEXT: v_readlane_b32 s37, v40, 5
-; GCN-NEXT: v_readlane_b32 s36, v40, 4
-; GCN-NEXT: v_readlane_b32 s35, v40, 3
-; GCN-NEXT: v_readlane_b32 s34, v40, 2
-; GCN-NEXT: v_readlane_b32 s31, v40, 1
-; GCN-NEXT: v_readlane_b32 s30, v40, 0
+; GCN-NEXT: v_readlane_b32 s30, v40, 16
+; GCN-NEXT: v_readlane_b32 s31, v40, 17
+; GCN-NEXT: v_readlane_b32 s65, v40, 15
+; GCN-NEXT: v_readlane_b32 s64, v40, 14
+; GCN-NEXT: v_readlane_b32 s55, v40, 13
+; GCN-NEXT: v_readlane_b32 s54, v40, 12
+; GCN-NEXT: v_readlane_b32 s53, v40, 11
+; GCN-NEXT: v_readlane_b32 s52, v40, 10
+; GCN-NEXT: v_readlane_b32 s51, v40, 9
+; GCN-NEXT: v_readlane_b32 s50, v40, 8
+; GCN-NEXT: v_readlane_b32 s49, v40, 7
+; GCN-NEXT: v_readlane_b32 s48, v40, 6
+; GCN-NEXT: v_readlane_b32 s39, v40, 5
+; GCN-NEXT: v_readlane_b32 s38, v40, 4
+; GCN-NEXT: v_readlane_b32 s37, v40, 3
+; GCN-NEXT: v_readlane_b32 s36, v40, 2
+; GCN-NEXT: v_readlane_b32 s35, v40, 1
+; GCN-NEXT: v_readlane_b32 s34, v40, 0
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: v_readlane_b32 s4, v40, 18
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -387,24 +387,24 @@ define void @test_indirect_call_vgpr_ptr_arg(ptr %fptr) {
; GISEL-NEXT: s_mov_b64 exec, s[18:19]
; GISEL-NEXT: v_writelane_b32 v40, s16, 18
; GISEL-NEXT: s_addk_i32 s32, 0x400
-; GISEL-NEXT: v_writelane_b32 v40, s30, 0
-; GISEL-NEXT: v_writelane_b32 v40, s31, 1
-; GISEL-NEXT: v_writelane_b32 v40, s34, 2
-; GISEL-NEXT: v_writelane_b32 v40, s35, 3
-; GISEL-NEXT: v_writelane_b32 v40, s36, 4
-; GISEL-NEXT: v_writelane_b32 v40, s37, 5
-; GISEL-NEXT: v_writelane_b32 v40, s38, 6
-; GISEL-NEXT: v_writelane_b32 v40, s39, 7
-; GISEL-NEXT: v_writelane_b32 v40, s48, 8
-; GISEL-NEXT: v_writelane_b32 v40, s49, 9
-; GISEL-NEXT: v_writelane_b32 v40, s50, 10
-; GISEL-NEXT: v_writelane_b32 v40, s51, 11
-; GISEL-NEXT: v_writelane_b32 v40, s52, 12
-; GISEL-NEXT: v_writelane_b32 v40, s53, 13
-; GISEL-NEXT: v_writelane_b32 v40, s54, 14
-; GISEL-NEXT: v_writelane_b32 v40, s55, 15
-; GISEL-NEXT: v_writelane_b32 v40, s64, 16
-; GISEL-NEXT: v_writelane_b32 v40, s65, 17
+; GISEL-NEXT: v_writelane_b32 v40, s34, 0
+; GISEL-NEXT: v_writelane_b32 v40, s35, 1
+; GISEL-NEXT: v_writelane_b32 v40, s36, 2
+; GISEL-NEXT: v_writelane_b32 v40, s37, 3
+; GISEL-NEXT: v_writelane_b32 v40, s38, 4
+; GISEL-NEXT: v_writelane_b32 v40, s39, 5
+; GISEL-NEXT: v_writelane_b32 v40, s48, 6
+; GISEL-NEXT: v_writelane_b32 v40, s49, 7
+; GISEL-NEXT: v_writelane_b32 v40, s50, 8
+; GISEL-NEXT: v_writelane_b32 v40, s51, 9
+; GISEL-NEXT: v_writelane_b32 v40, s52, 10
+; GISEL-NEXT: v_writelane_b32 v40, s53, 11
+; GISEL-NEXT: v_writelane_b32 v40, s54, 12
+; GISEL-NEXT: v_writelane_b32 v40, s55, 13
+; GISEL-NEXT: v_writelane_b32 v40, s64, 14
+; GISEL-NEXT: v_writelane_b32 v40, s65, 15
+; GISEL-NEXT: v_writelane_b32 v40, s30, 16
+; GISEL-NEXT: v_writelane_b32 v40, s31, 17
; GISEL-NEXT: s_mov_b32 s50, s15
; GISEL-NEXT: s_mov_b32 s51, s14
; GISEL-NEXT: s_mov_b32 s52, s13
@@ -435,24 +435,24 @@ define void @test_indirect_call_vgpr_ptr_arg(ptr %fptr) {
; GISEL-NEXT: s_cbranch_execnz .LBB3_1
; GISEL-NEXT: ; %bb.2:
; GISEL-NEXT: s_mov_b64 exec, s[54:55]
-; GISEL-NEXT: v_readlane_b32 s65, v40, 17
-; GISEL-NEXT: v_readlane_b32 s64, v40, 16
-; GISEL-NEXT: v_readlane_b32 s55, v40, 15
-; GISEL-NEXT: v_readlane_b32 s54, v40, 14
-; GISEL-NEXT: v_readlane_b32 s53, v40, 13
-; GISEL-NEXT: v_readlane_b32 s52, v40, 12
-; GISEL-NEXT: v_readlane_b32 s51, v40, 11
-; GISEL-NEXT: v_readlane_b32 s50, v40, 10
-; GISEL-NEXT: v_readlane_b32 s49, v40, 9
-; GISEL-NEXT: v_readlane_b32 s48, v40, 8
-; GISEL-NEXT: v_readlane_b32 s39, v40, 7
-; GISEL-NEXT: v_readlane_b32 s38, v40, 6
-; GISEL-NEXT: v_readlane_b32 s37, v40, 5
-; GISEL-NEXT: v_readlane_b32 s36, v40, 4
-; GISEL-NEXT: v_readlane_b32 s35, v40, 3
-; GISEL-NEXT: v_readlane_b32 s34, v40, 2
-; GISEL-NEXT: v_readlane_b32 s31, v40, 1
-; GISEL-NEXT: v_readlane_b32 s30, v40, 0
+; GISEL-NEXT: v_readlane_b32 s30, v40, 16
+; GISEL-NEXT: v_readlane_b32 s31, v40, 17
+; GISEL-NEXT: v_readlane_b32 s65, v40, 15
+; GISEL-NEXT: v_readlane_b32 s64, v40, 14
+; GISEL-NEXT: v_readlane_b32 s55, v40, 13
+; GISEL-NEXT: v_readlane_b32 s54, v40, 12
+; GISEL-NEXT: v_readlane_b32 s53, v40, 11
+; GISEL-NEXT: v_readlane_b32 s52, v40, 10
+; GISEL-NEXT: v_readlane_b32 s51, v40, 9
+; GISEL-NEXT: v_readlane_b32 s50, v40, 8
+; GISEL-NEXT: v_readlane_b32 s49, v40, 7
+; GISEL-NEXT: v_readlane_b32 s48, v40, 6
+; GISEL-NEXT: v_readlane_b32 s39, v40, 5
+; GISEL-NEXT: v_readlane_b32 s38, v40, 4
+; GISEL-NEXT: v_readlane_b32 s37, v40, 3
+; GISEL-NEXT: v_readlane_b32 s36, v40, 2
+; GISEL-NEXT: v_readlane_b32 s35, v40, 1
+; GISEL-NEXT: v_readlane_b32 s34, v40, 0
; GISEL-NEXT: s_mov_b32 s32, s33
; GISEL-NEXT: v_readlane_b32 s4, v40, 18
; GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -476,24 +476,24 @@ define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) {
; GCN-NEXT: s_mov_b64 exec, s[18:19]
; GCN-NEXT: v_writelane_b32 v40, s16, 18
; GCN-NEXT: s_addk_i32 s32, 0x400
-; GCN-NEXT: v_writelane_b32 v40, s30, 0
-; GCN-NEXT: v_writelane_b32 v40, s31, 1
-; GCN-NEXT: v_writelane_b32 v40, s34, 2
-; GCN-NEXT: v_writelane_b32 v40, s35, 3
-; GCN-NEXT: v_writelane_b32 v40, s36, 4
-; GCN-NEXT: v_writelane_b32 v40, s37, 5
-; GCN-NEXT: v_writelane_b32 v40, s38, 6
-; GCN-NEXT: v_writelane_b32 v40, s39, 7
-; GCN-NEXT: v_writelane_b32 v40, s48, 8
-; GCN-NEXT: v_writelane_b32 v40, s49, 9
-; GCN-NEXT: v_writelane_b32 v40, s50, 10
-; GCN-NEXT: v_writelane_b32 v40, s51, 11
-; GCN-NEXT: v_writelane_b32 v40, s52, 12
-; GCN-NEXT: v_writelane_b32 v40, s53, 13
-; GCN-NEXT: v_writelane_b32 v40, s54, 14
-; GCN-NEXT: v_writelane_b32 v40, s55, 15
-; GCN-NEXT: v_writelane_b32 v40, s64, 16
-; GCN-NEXT: v_writelane_b32 v40, s65, 17
+; GCN-NEXT: v_writelane_b32 v40, s34, 0
+; GCN-NEXT: v_writelane_b32 v40, s35, 1
+; GCN-NEXT: v_writelane_b32 v40, s36, 2
+; GCN-NEXT: v_writelane_b32 v40, s37, 3
+; GCN-NEXT: v_writelane_b32 v40, s38, 4
+; GCN-NEXT: v_writelane_b32 v40, s39, 5
+; GCN-NEXT: v_writelane_b32 v40, s48, 6
+; GCN-NEXT: v_writelane_b32 v40, s49, 7
+; GCN-NEXT: v_writelane_b32 v40, s50, 8
+; GCN-NEXT: v_writelane_b32 v40, s51, 9
+; GCN-NEXT: v_writelane_b32 v40, s52, 10
+; GCN-NEXT: v_writelane_b32 v40, s53, 11
+; GCN-NEXT: v_writelane_b32 v40, s54, 12
+; GCN-NEXT: v_writelane_b32 v40, s55, 13
+; GCN-NEXT: v_writelane_b32 v40, s64, 14
+; GCN-NEXT: v_writelane_b32 v40, s65, 15
+; GCN-NEXT: v_writelane_b32 v40, s30, 16
+; GCN-NEXT: v_writelane_b32 v40, s31, 17
; GCN-NEXT: s_mov_b32 s50, s15
; GCN-NEXT: s_mov_b32 s51, s14
; GCN-NEXT: s_mov_b32 s52, s13
@@ -525,24 +525,24 @@ define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) {
; GCN-NEXT: ; %bb.2:
; GCN-NEXT: s_mov_b64 exec, s[54:55]
; GCN-NEXT: v_add_i32_e32 v0, vcc, 1, v2
-; GCN-NEXT: v_readlane_b32 s65, v40, 17
-; GCN-NEXT: v_readlane_b32 s64, v40, 16
-; GCN-NEXT: v_readlane_b32 s55, v40, 15
-; GCN-NEXT: v_readlane_b32 s54, v40, 14
-; GCN-NEXT: v_readlane_b32 s53, v40, 13
-; GCN-NEXT: v_readlane_b32 s52, v40, 12
-; GCN-NEXT: v_readlane_b32 s51, v40, 11
-; GCN-NEXT: v_readlane_b32 s50, v40, 10
-; GCN-NEXT: v_readlane_b32 s49, v40, 9
-; GCN-NEXT: v_readlane_b32 s48, v40, 8
-; GCN-NEXT: v_readlane_b32 s39, v40, 7
-; GCN-NEXT: v_readlane_b32 s38, v40, 6
-; GCN-NEXT: v_readlane_b32 s37, v40, 5
-; GCN-NEXT: v_readlane_b32 s36, v40, 4
-; GCN-NEXT: v_readlane_b32 s35, v40, 3
-; GCN-NEXT: v_readlane_b32 s34, v40, 2
-; GCN-NEXT: v_readlane_b32 s31, v40, 1
-; GCN-NEXT: v_readlane_b32 s30, v40, 0
+; GCN-NEXT: v_readlane_b32 s30, v40, 16
+; GCN-NEXT: v_readlane_b32 s31, v40, 17
+; GCN-NEXT: v_readlane_b32 s65, v40, 15
+; GCN-NEXT: v_readlane_b32 s64, v40, 14
+; GCN-NEXT: v_readlane_b32 s55, v40, 13
+; GCN-NEXT: v_readlane_b32 s54, v40, 12
+; GCN-NEXT: v_readlane_b32 s53, v40, 11
+; GCN-NEXT: v_readlane_b32 s52, v40, 10
+; GCN-NEXT: v_readlane_b32 s51, v40, 9
+; GCN-NEXT: v_readlane_b32 s50, v40, 8
+; GCN-NEXT: v_readlane_b32 s49, v40, 7
+; GCN-NEXT: v_readlane_b32 s48, v40, 6
+; GCN-NEXT: v_readlane_b32 s39, v40, 5
+; GCN-NEXT: v_readlane_b32 s38, v40, 4
+; GCN-NEXT: v_readlane_b32 s37, v40, 3
+; GCN-NEXT: v_readlane_b32 s36, v40, 2
+; GCN-NEXT: v_readlane_b32 s35, v40, 1
+; GCN-NEXT: v_readlane_b32 s34, v40, 0
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: v_readlane_b32 s4, v40, 18
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -562,24 +562,24 @@ define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) {
; GISEL-NEXT: s_mov_b64 exec, s[18:19]
; GISEL-NEXT: v_writelane_b32 v40, s16, 18
; GISEL-NEXT: s_addk_i32 s32, 0x400
-; GISEL-NEXT: v_writelane_b32 v40, s30, 0
-; GISEL-NEXT: v_writelane_b32 v40, s31, 1
-; GISEL-NEXT: v_writelane_b32 v40, s34, 2
-; GISEL-NEXT: v_writelane_b32 v40, s35, 3
-; GISEL-NEXT: v_writelane_b32 v40, s36, 4
-; GISEL-NEXT: v_writelane_b32 v40, s37, 5
-; GISEL-NEXT: v_writelane_b32 v40, s38, 6
-; GISEL-NEXT: v_writelane_b32 v40, s39, 7
-; GISEL-NEXT: v_writelane_b32 v40, s48, 8
-; GISEL-NEXT: v_writelane_b32 v40, s49, 9
-; GISEL-NEXT: v_writelane_b32 v40, s50, 10
-; GISEL-NEXT: v_writelane_b32 v40, s51, 11
-; GISEL-NEXT: v_writelane_b32 v40, s52, 12
-; GISEL-NEXT: v_writelane_b32 v40, s53, 13
-; GISEL-NEXT: v_writelane_b32 v40, s54, 14
-; GISEL-NEXT: v_writelane_b32 v40, s55, 15
-; GISEL-NEXT: v_writelane_b32 v40, s64, 16
-; GISEL-NEXT: v_writelane_b32 v40, s65, 17
+; GISEL-NEXT: v_writelane_b32 v40, s34, 0
+; GISEL-NEXT: v_writelane_b32 v40, s35, 1
+; GISEL-NEXT: v_writelane_b32 v40, s36, 2
+; GISEL-NEXT: v_writelane_b32 v40, s37, 3
+; GISEL-NEXT: v_writelane_b32 v40, s38, 4
+; GISEL-NEXT: v_writelane_b32 v40, s39, 5
+; GISEL-NEXT: v_writelane_b32 v40, s48, 6
+; GISEL-NEXT: v_writelane_b32 v40, s49, 7
+; GISEL-NEXT: v_writelane_b32 v40, s50, 8
+; GISEL-NEXT: v_writelane_b32 v40, s51, 9
+; GISEL-NEXT: v_writelane_b32 v40, s52, 10
+; GISEL-NEXT: v_writelane_b32 v40, s53, 11
+; GISEL-NEXT: v_writelane_b32 v40, s54, 12
+; GISEL-NEXT: v_writelane_b32 v40, s55, 13
+; GISEL-NEXT: v_writelane_b32 v40, s64, 14
+; GISEL-NEXT: v_writelane_b32 v40, s65, 15
+; GISEL-NEXT: v_writelane_b32 v40, s30, 16
+; GISEL-NEXT: v_writelane_b32 v40, s31, 17
; GISEL-NEXT: s_mov_b32 s50, s15
; GISEL-NEXT: s_mov_b32 s51, s14
; GISEL-NEXT: s_mov_b32 s52, s13
@@ -611,24 +611,24 @@ define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) {
; GISEL-NEXT: ; %bb.2:
; GISEL-NEXT: s_mov_b64 exec, s[54:55]
; GISEL-NEXT: v_add_i32_e32 v0, vcc, 1, v1
-; GISEL-NEXT: v_readlane_b32 s65, v40, 17
-; GISEL-NEXT: v_readlane_b32 s64, v40, 16
-; GISEL-NEXT: v_readlane_b32 s55, v40, 15
-; GISEL-NEXT: v_readlane_b32 s54, v40, 14
-; GISEL-NEXT: v_readlane_b32 s53, v40, 13
-; GISEL-NEXT: v_readlane_b32 s52, v40, 12
-; GISEL-NEXT: v_readlane_b32 s51, v40, 11
-; GISEL-NEXT: v_readlane_b32 s50, v40, 10
-; GISEL-NEXT: v_readlane_b32 s49, v40, 9
-; GISEL-NEXT: v_readlane_b32 s48, v40, 8
-; GISEL-NEXT: v_readlane_b32 s39, v40, 7
-; GISEL-NEXT: v_readlane_b32 s38, v40, 6
-; GISEL-NEXT: v_readlane_b32 s37, v40, 5
-; GISEL-NEXT: v_readlane_b32 s36, v40, 4
-; GISEL-NEXT: v_readlane_b32 s35, v40, 3
-; GISEL-NEXT: v_readlane_b32 s34, v40, 2
-; GISEL-NEXT: v_readlane_b32 s31, v40, 1
-; GISEL-NEXT: v_readlane_b32 s30, v40, 0
+; GISEL-NEXT: v_readlane_b32 s30, v40, 16
+; GISEL-NEXT: v_readlane_b32 s31, v40, 17
+; GISEL-NEXT: v_readlane_b32 s65, v40, 15
+; GISEL-NEXT: v_readlane_b32 s64, v40, 14
+; GISEL-NEXT: v_readlane_b32 s55, v40, 13
+; GISEL-NEXT: v_readlane_b32 s54, v40, 12
+; GISEL-NEXT: v_readlane_b32 s53, v40, 11
+; GISEL-NEXT: v_readlane_b32 s52, v40, 10
+; GISEL-NEXT: v_readlane_b32 s51, v40, 9
+; GISEL-NEXT: v_readlane_b32 s50, v40, 8
+; GISEL-NEXT: v_readlane_b32 s49, v40, 7
+; GISEL-NEXT: v_readlane_b32 s48, v40, 6
+; GISEL-NEXT: v_readlane_b32 s39, v40, 5
+; GISEL-NEXT: v_readlane_b32 s38, v40, 4
+; GISEL-NEXT: v_readlane_b32 s37, v40, 3
+; GISEL-NEXT: v_readlane_b32 s36, v40, 2
+; GISEL-NEXT: v_readlane_b32 s35, v40, 1
+; GISEL-NEXT: v_readlane_b32 s34, v40, 0
; GISEL-NEXT: s_mov_b32 s32, s33
; GISEL-NEXT: v_readlane_b32 s4, v40, 18
; GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -653,26 +653,26 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) {
; GCN-NEXT: s_mov_b64 exec, s[18:19]
; GCN-NEXT: v_writelane_b32 v40, s16, 20
; GCN-NEXT: s_addk_i32 s32, 0x400
-; GCN-NEXT: v_writelane_b32 v40, s30, 0
-; GCN-NEXT: v_writelane_b32 v40, s31, 1
-; GCN-NEXT: v_writelane_b32 v40, s34, 2
-; GCN-NEXT: v_writelane_b32 v40, s35, 3
-; GCN-NEXT: v_writelane_b32 v40, s36, 4
-; GCN-NEXT: v_writelane_b32 v40, s37, 5
-; GCN-NEXT: v_writelane_b32 v40, s38, 6
-; GCN-NEXT: v_writelane_b32 v40, s39, 7
-; GCN-NEXT: v_writelane_b32 v40, s48, 8
-; GCN-NEXT: v_writelane_b32 v40, s49, 9
-; GCN-NEXT: v_writelane_b32 v40, s50, 10
-; GCN-NEXT: v_writelane_b32 v40, s51, 11
-; GCN-NEXT: v_writelane_b32 v40, s52, 12
-; GCN-NEXT: v_writelane_b32 v40, s53, 13
-; GCN-NEXT: v_writelane_b32 v40, s54, 14
-; GCN-NEXT: v_writelane_b32 v40, s55, 15
-; GCN-NEXT: v_writelane_b32 v40, s64, 16
-; GCN-NEXT: v_writelane_b32 v40, s65, 17
-; GCN-NEXT: v_writelane_b32 v40, s66, 18
-; GCN-NEXT: v_writelane_b32 v40, s67, 19
+; GCN-NEXT: v_writelane_b32 v40, s34, 0
+; GCN-NEXT: v_writelane_b32 v40, s35, 1
+; GCN-NEXT: v_writelane_b32 v40, s36, 2
+; GCN-NEXT: v_writelane_b32 v40, s37, 3
+; GCN-NEXT: v_writelane_b32 v40, s38, 4
+; GCN-NEXT: v_writelane_b32 v40, s39, 5
+; GCN-NEXT: v_writelane_b32 v40, s48, 6
+; GCN-NEXT: v_writelane_b32 v40, s49, 7
+; GCN-NEXT: v_writelane_b32 v40, s50, 8
+; GCN-NEXT: v_writelane_b32 v40, s51, 9
+; GCN-NEXT: v_writelane_b32 v40, s52, 10
+; GCN-NEXT: v_writelane_b32 v40, s53, 11
+; GCN-NEXT: v_writelane_b32 v40, s54, 12
+; GCN-NEXT: v_writelane_b32 v40, s55, 13
+; GCN-NEXT: v_writelane_b32 v40, s64, 14
+; GCN-NEXT: v_writelane_b32 v40, s65, 15
+; GCN-NEXT: v_writelane_b32 v40, s66, 16
+; GCN-NEXT: v_writelane_b32 v40, s67, 17
+; GCN-NEXT: v_writelane_b32 v40, s30, 18
+; GCN-NEXT: v_writelane_b32 v40, s31, 19
; GCN-NEXT: s_mov_b32 s50, s15
; GCN-NEXT: s_mov_b32 s51, s14
; GCN-NEXT: s_mov_b32 s52, s13
@@ -709,26 +709,26 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) {
; GCN-NEXT: s_mov_b64 exec, s[64:65]
; GCN-NEXT: .LBB5_4: ; %bb2
; GCN-NEXT: s_or_b64 exec, exec, s[54:55]
-; GCN-NEXT: v_readlane_b32 s67, v40, 19
-; GCN-NEXT: v_readlane_b32 s66, v40, 18
-; GCN-NEXT: v_readlane_b32 s65, v40, 17
-; GCN-NEXT: v_readlane_b32 s64, v40, 16
-; GCN-NEXT: v_readlane_b32 s55, v40, 15
-; GCN-NEXT: v_readlane_b32 s54, v40, 14
-; GCN-NEXT: v_readlane_b32 s53, v40, 13
-; GCN-NEXT: v_readlane_b32 s52, v40, 12
-; GCN-NEXT: v_readlane_b32 s51, v40, 11
-; GCN-NEXT: v_readlane_b32 s50, v40, 10
-; GCN-NEXT: v_readlane_b32 s49, v40, 9
-; GCN-NEXT: v_readlane_b32 s48, v40, 8
-; GCN-NEXT: v_readlane_b32 s39, v40, 7
-; GCN-NEXT: v_readlane_b32 s38, v40, 6
-; GCN-NEXT: v_readlane_b32 s37, v40, 5
-; GCN-NEXT: v_readlane_b32 s36, v40, 4
-; GCN-NEXT: v_readlane_b32 s35, v40, 3
-; GCN-NEXT: v_readlane_b32 s34, v40, 2
-; GCN-NEXT: v_readlane_b32 s31, v40, 1
-; GCN-NEXT: v_readlane_b32 s30, v40, 0
+; GCN-NEXT: v_readlane_b32 s30, v40, 18
+; GCN-NEXT: v_readlane_b32 s31, v40, 19
+; GCN-NEXT: v_readlane_b32 s67, v40, 17
+; GCN-NEXT: v_readlane_b32 s66, v40, 16
+; GCN-NEXT: v_readlane_b32 s65, v40, 15
+; GCN-NEXT: v_readlane_b32 s64, v40, 14
+; GCN-NEXT: v_readlane_b32 s55, v40, 13
+; GCN-NEXT: v_readlane_b32 s54, v40, 12
+; GCN-NEXT: v_readlane_b32 s53, v40, 11
+; GCN-NEXT: v_readlane_b32 s52, v40, 10
+; GCN-NEXT: v_readlane_b32 s51, v40, 9
+; GCN-NEXT: v_readlane_b32 s50, v40, 8
+; GCN-NEXT: v_readlane_b32 s49, v40, 7
+; GCN-NEXT: v_readlane_b32 s48, v40, 6
+; GCN-NEXT: v_readlane_b32 s39, v40, 5
+; GCN-NEXT: v_readlane_b32 s38, v40, 4
+; GCN-NEXT: v_readlane_b32 s37, v40, 3
+; GCN-NEXT: v_readlane_b32 s36, v40, 2
+; GCN-NEXT: v_readlane_b32 s35, v40, 1
+; GCN-NEXT: v_readlane_b32 s34, v40, 0
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: v_readlane_b32 s4, v40, 20
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -748,26 +748,26 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) {
; GISEL-NEXT: s_mov_b64 exec, s[18:19]
; GISEL-NEXT: v_writelane_b32 v40, s16, 20
; GISEL-NEXT: s_addk_i32 s32, 0x400
-; GISEL-NEXT: v_writelane_b32 v40, s30, 0
-; GISEL-NEXT: v_writelane_b32 v40, s31, 1
-; GISEL-NEXT: v_writelane_b32 v40, s34, 2
-; GISEL-NEXT: v_writelane_b32 v40, s35, 3
-; GISEL-NEXT: v_writelane_b32 v40, s36, 4
-; GISEL-NEXT: v_writelane_b32 v40, s37, 5
-; GISEL-NEXT: v_writelane_b32 v40, s38, 6
-; GISEL-NEXT: v_writelane_b32 v40, s39, 7
-; GISEL-NEXT: v_writelane_b32 v40, s48, 8
-; GISEL-NEXT: v_writelane_b32 v40, s49, 9
-; GISEL-NEXT: v_writelane_b32 v40, s50, 10
-; GISEL-NEXT: v_writelane_b32 v40, s51, 11
-; GISEL-NEXT: v_writelane_b32 v40, s52, 12
-; GISEL-NEXT: v_writelane_b32 v40, s53, 13
-; GISEL-NEXT: v_writelane_b32 v40, s54, 14
-; GISEL-NEXT: v_writelane_b32 v40, s55, 15
-; GISEL-NEXT: v_writelane_b32 v40, s64, 16
-; GISEL-NEXT: v_writelane_b32 v40, s65, 17
-; GISEL-NEXT: v_writelane_b32 v40, s66, 18
-; GISEL-NEXT: v_writelane_b32 v40, s67, 19
+; GISEL-NEXT: v_writelane_b32 v40, s34, 0
+; GISEL-NEXT: v_writelane_b32 v40, s35, 1
+; GISEL-NEXT: v_writelane_b32 v40, s36, 2
+; GISEL-NEXT: v_writelane_b32 v40, s37, 3
+; GISEL-NEXT: v_writelane_b32 v40, s38, 4
+; GISEL-NEXT: v_writelane_b32 v40, s39, 5
+; GISEL-NEXT: v_writelane_b32 v40, s48, 6
+; GISEL-NEXT: v_writelane_b32 v40, s49, 7
+; GISEL-NEXT: v_writelane_b32 v40, s50, 8
+; GISEL-NEXT: v_writelane_b32 v40, s51, 9
+; GISEL-NEXT: v_writelane_b32 v40, s52, 10
+; GISEL-NEXT: v_writelane_b32 v40, s53, 11
+; GISEL-NEXT: v_writelane_b32 v40, s54, 12
+; GISEL-NEXT: v_writelane_b32 v40, s55, 13
+; GISEL-NEXT: v_writelane_b32 v40, s64, 14
+; GISEL-NEXT: v_writelane_b32 v40, s65, 15
+; GISEL-NEXT: v_writelane_b32 v40, s66, 16
+; GISEL-NEXT: v_writelane_b32 v40, s67, 17
+; GISEL-NEXT: v_writelane_b32 v40, s30, 18
+; GISEL-NEXT: v_writelane_b32 v40, s31, 19
; GISEL-NEXT: s_mov_b32 s50, s15
; GISEL-NEXT: s_mov_b32 s51, s14
; GISEL-NEXT: s_mov_b32 s52, s13
@@ -804,26 +804,26 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) {
; GISEL-NEXT: s_mov_b64 exec, s[64:65]
; GISEL-NEXT: .LBB5_4: ; %bb2
; GISEL-NEXT: s_or_b64 exec, exec, s[54:55]
-; GISEL-NEXT: v_readlane_b32 s67, v40, 19
-; GISEL-NEXT: v_readlane_b32 s66, v40, 18
-; GISEL-NEXT: v_readlane_b32 s65, v40, 17
-; GISEL-NEXT: v_readlane_b32 s64, v40, 16
-; GISEL-NEXT: v_readlane_b32 s55, v40, 15
-; GISEL-NEXT: v_readlane_b32 s54, v40, 14
-; GISEL-NEXT: v_readlane_b32 s53, v40, 13
-; GISEL-NEXT: v_readlane_b32 s52, v40, 12
-; GISEL-NEXT: v_readlane_b32 s51, v40, 11
-; GISEL-NEXT: v_readlane_b32 s50, v40, 10
-; GISEL-NEXT: v_readlane_b32 s49, v40, 9
-; GISEL-NEXT: v_readlane_b32 s48, v40, 8
-; GISEL-NEXT: v_readlane_b32 s39, v40, 7
-; GISEL-NEXT: v_readlane_b32 s38, v40, 6
-; GISEL-NEXT: v_readlane_b32 s37, v40, 5
-; GISEL-NEXT: v_readlane_b32 s36, v40, 4
-; GISEL-NEXT: v_readlane_b32 s35, v40, 3
-; GISEL-NEXT: v_readlane_b32 s34, v40, 2
-; GISEL-NEXT: v_readlane_b32 s31, v40, 1
-; GISEL-NEXT: v_readlane_b32 s30, v40, 0
+; GISEL-NEXT: v_readlane_b32 s30, v40, 18
+; GISEL-NEXT: v_readlane_b32 s31, v40, 19
+; GISEL-NEXT: v_readlane_b32 s67, v40, 17
+; GISEL-NEXT: v_readlane_b32 s66, v40, 16
+; GISEL-NEXT: v_readlane_b32 s65, v40, 15
+; GISEL-NEXT: v_readlane_b32 s64, v40, 14
+; GISEL-NEXT: v_readlane_b32 s55, v40, 13
+; GISEL-NEXT: v_readlane_b32 s54, v40, 12
+; GISEL-NEXT: v_readlane_b32 s53, v40, 11
+; GISEL-NEXT: v_readlane_b32 s52, v40, 10
+; GISEL-NEXT: v_readlane_b32 s51, v40, 9
+; GISEL-NEXT: v_readlane_b32 s50, v40, 8
+; GISEL-NEXT: v_readlane_b32 s49, v40, 7
+; GISEL-NEXT: v_readlane_b32 s48, v40, 6
+; GISEL-NEXT: v_readlane_b32 s39, v40, 5
+; GISEL-NEXT: v_readlane_b32 s38, v40, 4
+; GISEL-NEXT: v_readlane_b32 s37, v40, 3
+; GISEL-NEXT: v_readlane_b32 s36, v40, 2
+; GISEL-NEXT: v_readlane_b32 s35, v40, 1
+; GISEL-NEXT: v_readlane_b32 s34, v40, 0
; GISEL-NEXT: s_mov_b32 s32, s33
; GISEL-NEXT: v_readlane_b32 s4, v40, 20
; GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -853,22 +853,22 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) {
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[6:7]
; GCN-NEXT: s_addk_i32 s32, 0x400
-; GCN-NEXT: v_writelane_b32 v40, s30, 0
-; GCN-NEXT: v_writelane_b32 v40, s31, 1
-; GCN-NEXT: v_writelane_b32 v40, s34, 2
-; GCN-NEXT: v_writelane_b32 v40, s35, 3
-; GCN-NEXT: v_writelane_b32 v40, s36, 4
-; GCN-NEXT: v_writelane_b32 v40, s37, 5
-; GCN-NEXT: v_writelane_b32 v40, s38, 6
-; GCN-NEXT: v_writelane_b32 v40, s39, 7
-; GCN-NEXT: v_writelane_b32 v40, s48, 8
-; GCN-NEXT: v_writelane_b32 v40, s49, 9
-; GCN-NEXT: v_writelane_b32 v40, s50, 10
-; GCN-NEXT: v_writelane_b32 v40, s51, 11
-; GCN-NEXT: v_writelane_b32 v40, s52, 12
-; GCN-NEXT: v_writelane_b32 v40, s53, 13
-; GCN-NEXT: v_writelane_b32 v40, s54, 14
-; GCN-NEXT: v_writelane_b32 v40, s55, 15
+; GCN-NEXT: v_writelane_b32 v40, s34, 0
+; GCN-NEXT: v_writelane_b32 v40, s35, 1
+; GCN-NEXT: v_writelane_b32 v40, s36, 2
+; GCN-NEXT: v_writelane_b32 v40, s37, 3
+; GCN-NEXT: v_writelane_b32 v40, s38, 4
+; GCN-NEXT: v_writelane_b32 v40, s39, 5
+; GCN-NEXT: v_writelane_b32 v40, s48, 6
+; GCN-NEXT: v_writelane_b32 v40, s49, 7
+; GCN-NEXT: v_writelane_b32 v40, s50, 8
+; GCN-NEXT: v_writelane_b32 v40, s51, 9
+; GCN-NEXT: v_writelane_b32 v40, s52, 10
+; GCN-NEXT: v_writelane_b32 v40, s53, 11
+; GCN-NEXT: v_writelane_b32 v40, s54, 12
+; GCN-NEXT: v_writelane_b32 v40, s55, 13
+; GCN-NEXT: v_writelane_b32 v40, s30, 14
+; GCN-NEXT: v_writelane_b32 v40, s31, 15
; GCN-NEXT: s_mov_b64 s[6:7], exec
; GCN-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
; GCN-NEXT: v_readfirstlane_b32 s8, v0
@@ -882,22 +882,22 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) {
; GCN-NEXT: s_cbranch_execnz .LBB6_1
; GCN-NEXT: ; %bb.2:
; GCN-NEXT: s_mov_b64 exec, s[6:7]
-; GCN-NEXT: v_readlane_b32 s55, v40, 15
-; GCN-NEXT: v_readlane_b32 s54, v40, 14
-; GCN-NEXT: v_readlane_b32 s53, v40, 13
-; GCN-NEXT: v_readlane_b32 s52, v40, 12
-; GCN-NEXT: v_readlane_b32 s51, v40, 11
-; GCN-NEXT: v_readlane_b32 s50, v40, 10
-; GCN-NEXT: v_readlane_b32 s49, v40, 9
-; GCN-NEXT: v_readlane_b32 s48, v40, 8
-; GCN-NEXT: v_readlane_b32 s39, v40, 7
-; GCN-NEXT: v_readlane_b32 s38, v40, 6
-; GCN-NEXT: v_readlane_b32 s37, v40, 5
-; GCN-NEXT: v_readlane_b32 s36, v40, 4
-; GCN-NEXT: v_readlane_b32 s35, v40, 3
-; GCN-NEXT: v_readlane_b32 s34, v40, 2
-; GCN-NEXT: v_readlane_b32 s31, v40, 1
-; GCN-NEXT: v_readlane_b32 s30, v40, 0
+; GCN-NEXT: v_readlane_b32 s30, v40, 14
+; GCN-NEXT: v_readlane_b32 s31, v40, 15
+; GCN-NEXT: v_readlane_b32 s55, v40, 13
+; GCN-NEXT: v_readlane_b32 s54, v40, 12
+; GCN-NEXT: v_readlane_b32 s53, v40, 11
+; GCN-NEXT: v_readlane_b32 s52, v40, 10
+; GCN-NEXT: v_readlane_b32 s51, v40, 9
+; GCN-NEXT: v_readlane_b32 s50, v40, 8
+; GCN-NEXT: v_readlane_b32 s49, v40, 7
+; GCN-NEXT: v_readlane_b32 s48, v40, 6
+; GCN-NEXT: v_readlane_b32 s39, v40, 5
+; GCN-NEXT: v_readlane_b32 s38, v40, 4
+; GCN-NEXT: v_readlane_b32 s37, v40, 3
+; GCN-NEXT: v_readlane_b32 s36, v40, 2
+; GCN-NEXT: v_readlane_b32 s35, v40, 1
+; GCN-NEXT: v_readlane_b32 s34, v40, 0
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -915,22 +915,22 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) {
; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GISEL-NEXT: s_mov_b64 exec, s[6:7]
; GISEL-NEXT: s_addk_i32 s32, 0x400
-; GISEL-NEXT: v_writelane_b32 v40, s30, 0
-; GISEL-NEXT: v_writelane_b32 v40, s31, 1
-; GISEL-NEXT: v_writelane_b32 v40, s34, 2
-; GISEL-NEXT: v_writelane_b32 v40, s35, 3
-; GISEL-NEXT: v_writelane_b32 v40, s36, 4
-; GISEL-NEXT: v_writelane_b32 v40, s37, 5
-; GISEL-NEXT: v_writelane_b32 v40, s38, 6
-; GISEL-NEXT: v_writelane_b32 v40, s39, 7
-; GISEL-NEXT: v_writelane_b32 v40, s48, 8
-; GISEL-NEXT: v_writelane_b32 v40, s49, 9
-; GISEL-NEXT: v_writelane_b32 v40, s50, 10
-; GISEL-NEXT: v_writelane_b32 v40, s51, 11
-; GISEL-NEXT: v_writelane_b32 v40, s52, 12
-; GISEL-NEXT: v_writelane_b32 v40, s53, 13
-; GISEL-NEXT: v_writelane_b32 v40, s54, 14
-; GISEL-NEXT: v_writelane_b32 v40, s55, 15
+; GISEL-NEXT: v_writelane_b32 v40, s34, 0
+; GISEL-NEXT: v_writelane_b32 v40, s35, 1
+; GISEL-NEXT: v_writelane_b32 v40, s36, 2
+; GISEL-NEXT: v_writelane_b32 v40, s37, 3
+; GISEL-NEXT: v_writelane_b32 v40, s38, 4
+; GISEL-NEXT: v_writelane_b32 v40, s39, 5
+; GISEL-NEXT: v_writelane_b32 v40, s48, 6
+; GISEL-NEXT: v_writelane_b32 v40, s49, 7
+; GISEL-NEXT: v_writelane_b32 v40, s50, 8
+; GISEL-NEXT: v_writelane_b32 v40, s51, 9
+; GISEL-NEXT: v_writelane_b32 v40, s52, 10
+; GISEL-NEXT: v_writelane_b32 v40, s53, 11
+; GISEL-NEXT: v_writelane_b32 v40, s54, 12
+; GISEL-NEXT: v_writelane_b32 v40, s55, 13
+; GISEL-NEXT: v_writelane_b32 v40, s30, 14
+; GISEL-NEXT: v_writelane_b32 v40, s31, 15
; GISEL-NEXT: s_mov_b64 s[6:7], exec
; GISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
; GISEL-NEXT: v_readfirstlane_b32 s8, v0
@@ -944,22 +944,22 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) {
; GISEL-NEXT: s_cbranch_execnz .LBB6_1
; GISEL-NEXT: ; %bb.2:
; GISEL-NEXT: s_mov_b64 exec, s[6:7]
-; GISEL-NEXT: v_readlane_b32 s55, v40, 15
-; GISEL-NEXT: v_readlane_b32 s54, v40, 14
-; GISEL-NEXT: v_readlane_b32 s53, v40, 13
-; GISEL-NEXT: v_readlane_b32 s52, v40, 12
-; GISEL-NEXT: v_readlane_b32 s51, v40, 11
-; GISEL-NEXT: v_readlane_b32 s50, v40, 10
-; GISEL-NEXT: v_readlane_b32 s49, v40, 9
-; GISEL-NEXT: v_readlane_b32 s48, v40, 8
-; GISEL-NEXT: v_readlane_b32 s39, v40, 7
-; GISEL-NEXT: v_readlane_b32 s38, v40, 6
-; GISEL-NEXT: v_readlane_b32 s37, v40, 5
-; GISEL-NEXT: v_readlane_b32 s36, v40, 4
-; GISEL-NEXT: v_readlane_b32 s35, v40, 3
-; GISEL-NEXT: v_readlane_b32 s34, v40, 2
-; GISEL-NEXT: v_readlane_b32 s31, v40, 1
-; GISEL-NEXT: v_readlane_b32 s30, v40, 0
+; GISEL-NEXT: v_readlane_b32 s30, v40, 14
+; GISEL-NEXT: v_readlane_b32 s31, v40, 15
+; GISEL-NEXT: v_readlane_b32 s55, v40, 13
+; GISEL-NEXT: v_readlane_b32 s54, v40, 12
+; GISEL-NEXT: v_readlane_b32 s53, v40, 11
+; GISEL-NEXT: v_readlane_b32 s52, v40, 10
+; GISEL-NEXT: v_readlane_b32 s51, v40, 9
+; GISEL-NEXT: v_readlane_b32 s50, v40, 8
+; GISEL-NEXT: v_readlane_b32 s49, v40, 7
+; GISEL-NEXT: v_readlane_b32 s48, v40, 6
+; GISEL-NEXT: v_readlane_b32 s39, v40, 5
+; GISEL-NEXT: v_readlane_b32 s38, v40, 4
+; GISEL-NEXT: v_readlane_b32 s37, v40, 3
+; GISEL-NEXT: v_readlane_b32 s36, v40, 2
+; GISEL-NEXT: v_readlane_b32 s35, v40, 1
+; GISEL-NEXT: v_readlane_b32 s34, v40, 0
; GISEL-NEXT: s_mov_b32 s32, s33
; GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1
; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -982,22 +982,22 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, ptr %fptr) {
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_addk_i32 s32, 0x400
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-NEXT: v_writelane_b32 v41, s30, 0
-; GCN-NEXT: v_writelane_b32 v41, s31, 1
-; GCN-NEXT: v_writelane_b32 v41, s34, 2
-; GCN-NEXT: v_writelane_b32 v41, s35, 3
-; GCN-NEXT: v_writelane_b32 v41, s36, 4
-; GCN-NEXT: v_writelane_b32 v41, s37, 5
-; GCN-NEXT: v_writelane_b32 v41, s38, 6
-; GCN-NEXT: v_writelane_b32 v41, s39, 7
-; GCN-NEXT: v_writelane_b32 v41, s48, 8
-; GCN-NEXT: v_writelane_b32 v41, s49, 9
-; GCN-NEXT: v_writelane_b32 v41, s50, 10
-; GCN-NEXT: v_writelane_b32 v41, s51, 11
-; GCN-NEXT: v_writelane_b32 v41, s52, 12
-; GCN-NEXT: v_writelane_b32 v41, s53, 13
-; GCN-NEXT: v_writelane_b32 v41, s54, 14
-; GCN-NEXT: v_writelane_b32 v41, s55, 15
+; GCN-NEXT: v_writelane_b32 v41, s34, 0
+; GCN-NEXT: v_writelane_b32 v41, s35, 1
+; GCN-NEXT: v_writelane_b32 v41, s36, 2
+; GCN-NEXT: v_writelane_b32 v41, s37, 3
+; GCN-NEXT: v_writelane_b32 v41, s38, 4
+; GCN-NEXT: v_writelane_b32 v41, s39, 5
+; GCN-NEXT: v_writelane_b32 v41, s48, 6
+; GCN-NEXT: v_writelane_b32 v41, s49, 7
+; GCN-NEXT: v_writelane_b32 v41, s50, 8
+; GCN-NEXT: v_writelane_b32 v41, s51, 9
+; GCN-NEXT: v_writelane_b32 v41, s52, 10
+; GCN-NEXT: v_writelane_b32 v41, s53, 11
+; GCN-NEXT: v_writelane_b32 v41, s54, 12
+; GCN-NEXT: v_writelane_b32 v41, s55, 13
+; GCN-NEXT: v_writelane_b32 v41, s30, 14
+; GCN-NEXT: v_writelane_b32 v41, s31, 15
; GCN-NEXT: v_mov_b32_e32 v40, v0
; GCN-NEXT: s_mov_b64 s[4:5], exec
; GCN-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
@@ -1013,22 +1013,22 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, ptr %fptr) {
; GCN-NEXT: ; %bb.2:
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v0, v40
-; GCN-NEXT: v_readlane_b32 s55, v41, 15
-; GCN-NEXT: v_readlane_b32 s54, v41, 14
-; GCN-NEXT: v_readlane_b32 s53, v41, 13
-; GCN-NEXT: v_readlane_b32 s52, v41, 12
-; GCN-NEXT: v_readlane_b32 s51, v41, 11
-; GCN-NEXT: v_readlane_b32 s50, v41, 10
-; GCN-NEXT: v_readlane_b32 s49, v41, 9
-; GCN-NEXT: v_readlane_b32 s48, v41, 8
-; GCN-NEXT: v_readlane_b32 s39, v41, 7
-; GCN-NEXT: v_readlane_b32 s38, v41, 6
-; GCN-NEXT: v_readlane_b32 s37, v41, 5
-; GCN-NEXT: v_readlane_b32 s36, v41, 4
-; GCN-NEXT: v_readlane_b32 s35, v41, 3
-; GCN-NEXT: v_readlane_b32 s34, v41, 2
-; GCN-NEXT: v_readlane_b32 s31, v41, 1
-; GCN-NEXT: v_readlane_b32 s30, v41, 0
+; GCN-NEXT: v_readlane_b32 s30, v41, 14
+; GCN-NEXT: v_readlane_b32 s31, v41, 15
+; GCN-NEXT: v_readlane_b32 s55, v41, 13
+; GCN-NEXT: v_readlane_b32 s54, v41, 12
+; GCN-NEXT: v_readlane_b32 s53, v41, 11
+; GCN-NEXT: v_readlane_b32 s52, v41, 10
+; GCN-NEXT: v_readlane_b32 s51, v41, 9
+; GCN-NEXT: v_readlane_b32 s50, v41, 8
+; GCN-NEXT: v_readlane_b32 s49, v41, 7
+; GCN-NEXT: v_readlane_b32 s48, v41, 6
+; GCN-NEXT: v_readlane_b32 s39, v41, 5
+; GCN-NEXT: v_readlane_b32 s38, v41, 4
+; GCN-NEXT: v_readlane_b32 s37, v41, 3
+; GCN-NEXT: v_readlane_b32 s36, v41, 2
+; GCN-NEXT: v_readlane_b32 s35, v41, 1
+; GCN-NEXT: v_readlane_b32 s34, v41, 0
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
@@ -1048,22 +1048,22 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, ptr %fptr) {
; GISEL-NEXT: s_mov_b64 exec, s[4:5]
; GISEL-NEXT: s_addk_i32 s32, 0x400
; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
-; GISEL-NEXT: v_writelane_b32 v41, s30, 0
-; GISEL-NEXT: v_writelane_b32 v41, s31, 1
-; GISEL-NEXT: v_writelane_b32 v41, s34, 2
-; GISEL-NEXT: v_writelane_b32 v41, s35, 3
-; GISEL-NEXT: v_writelane_b32 v41, s36, 4
-; GISEL-NEXT: v_writelane_b32 v41, s37, 5
-; GISEL-NEXT: v_writelane_b32 v41, s38, 6
-; GISEL-NEXT: v_writelane_b32 v41, s39, 7
-; GISEL-NEXT: v_writelane_b32 v41, s48, 8
-; GISEL-NEXT: v_writelane_b32 v41, s49, 9
-; GISEL-NEXT: v_writelane_b32 v41, s50, 10
-; GISEL-NEXT: v_writelane_b32 v41, s51, 11
-; GISEL-NEXT: v_writelane_b32 v41, s52, 12
-; GISEL-NEXT: v_writelane_b32 v41, s53, 13
-; GISEL-NEXT: v_writelane_b32 v41, s54, 14
-; GISEL-NEXT: v_writelane_b32 v41, s55, 15
+; GISEL-NEXT: v_writelane_b32 v41, s34, 0
+; GISEL-NEXT: v_writelane_b32 v41, s35, 1
+; GISEL-NEXT: v_writelane_b32 v41, s36, 2
+; GISEL-NEXT: v_writelane_b32 v41, s37, 3
+; GISEL-NEXT: v_writelane_b32 v41, s38, 4
+; GISEL-NEXT: v_writelane_b32 v41, s39, 5
+; GISEL-NEXT: v_writelane_b32 v41, s48, 6
+; GISEL-NEXT: v_writelane_b32 v41, s49, 7
+; GISEL-NEXT: v_writelane_b32 v41, s50, 8
+; GISEL-NEXT: v_writelane_b32 v41, s51, 9
+; GISEL-NEXT: v_writelane_b32 v41, s52, 10
+; GISEL-NEXT: v_writelane_b32 v41, s53, 11
+; GISEL-NEXT: v_writelane_b32 v41, s54, 12
+; GISEL-NEXT: v_writelane_b32 v41, s55, 13
+; GISEL-NEXT: v_writelane_b32 v41, s30, 14
+; GISEL-NEXT: v_writelane_b32 v41, s31, 15
; GISEL-NEXT: v_mov_b32_e32 v40, v0
; GISEL-NEXT: s_mov_b64 s[4:5], exec
; GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
@@ -1079,22 +1079,22 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, ptr %fptr) {
; GISEL-NEXT: ; %bb.2:
; GISEL-NEXT: s_mov_b64 exec, s[4:5]
; GISEL-NEXT: v_mov_b32_e32 v0, v40
-; GISEL-NEXT: v_readlane_b32 s55, v41, 15
-; GISEL-NEXT: v_readlane_b32 s54, v41, 14
-; GISEL-NEXT: v_readlane_b32 s53, v41, 13
-; GISEL-NEXT: v_readlane_b32 s52, v41, 12
-; GISEL-NEXT: v_readlane_b32 s51, v41, 11
-; GISEL-NEXT: v_readlane_b32 s50, v41, 10
-; GISEL-NEXT: v_readlane_b32 s49, v41, 9
-; GISEL-NEXT: v_readlane_b32 s48, v41, 8
-; GISEL-NEXT: v_readlane_b32 s39, v41, 7
-; GISEL-NEXT: v_readlane_b32 s38, v41, 6
-; GISEL-NEXT: v_readlane_b32 s37, v41, 5
-; GISEL-NEXT: v_readlane_b32 s36, v41, 4
-; GISEL-NEXT: v_readlane_b32 s35, v41, 3
-; GISEL-NEXT: v_readlane_b32 s34, v41, 2
-; GISEL-NEXT: v_readlane_b32 s31, v41, 1
-; GISEL-NEXT: v_readlane_b32 s30, v41, 0
+; GISEL-NEXT: v_readlane_b32 s30, v41, 14
+; GISEL-NEXT: v_readlane_b32 s31, v41, 15
+; GISEL-NEXT: v_readlane_b32 s55, v41, 13
+; GISEL-NEXT: v_readlane_b32 s54, v41, 12
+; GISEL-NEXT: v_readlane_b32 s53, v41, 11
+; GISEL-NEXT: v_readlane_b32 s52, v41, 10
+; GISEL-NEXT: v_readlane_b32 s51, v41, 9
+; GISEL-NEXT: v_readlane_b32 s50, v41, 8
+; GISEL-NEXT: v_readlane_b32 s49, v41, 7
+; GISEL-NEXT: v_readlane_b32 s48, v41, 6
+; GISEL-NEXT: v_readlane_b32 s39, v41, 5
+; GISEL-NEXT: v_readlane_b32 s38, v41, 4
+; GISEL-NEXT: v_readlane_b32 s37, v41, 3
+; GISEL-NEXT: v_readlane_b32 s36, v41, 2
+; GISEL-NEXT: v_readlane_b32 s35, v41, 1
+; GISEL-NEXT: v_readlane_b32 s34, v41, 0
; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GISEL-NEXT: s_mov_b32 s32, s33
; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
@@ -1121,22 +1121,22 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) {
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_addk_i32 s32, 0x400
-; GCN-NEXT: v_writelane_b32 v40, s30, 0
-; GCN-NEXT: v_writelane_b32 v40, s31, 1
-; GCN-NEXT: v_writelane_b32 v40, s34, 2
-; GCN-NEXT: v_writelane_b32 v40, s35, 3
-; GCN-NEXT: v_writelane_b32 v40, s36, 4
-; GCN-NEXT: v_writelane_b32 v40, s37, 5
-; GCN-NEXT: v_writelane_b32 v40, s38, 6
-; GCN-NEXT: v_writelane_b32 v40, s39, 7
-; GCN-NEXT: v_writelane_b32 v40, s48, 8
-; GCN-NEXT: v_writelane_b32 v40, s49, 9
-; GCN-NEXT: v_writelane_b32 v40, s50, 10
-; GCN-NEXT: v_writelane_b32 v40, s51, 11
-; GCN-NEXT: v_writelane_b32 v40, s52, 12
-; GCN-NEXT: v_writelane_b32 v40, s53, 13
-; GCN-NEXT: v_writelane_b32 v40, s54, 14
-; GCN-NEXT: v_writelane_b32 v40, s55, 15
+; GCN-NEXT: v_writelane_b32 v40, s34, 0
+; GCN-NEXT: v_writelane_b32 v40, s35, 1
+; GCN-NEXT: v_writelane_b32 v40, s36, 2
+; GCN-NEXT: v_writelane_b32 v40, s37, 3
+; GCN-NEXT: v_writelane_b32 v40, s38, 4
+; GCN-NEXT: v_writelane_b32 v40, s39, 5
+; GCN-NEXT: v_writelane_b32 v40, s48, 6
+; GCN-NEXT: v_writelane_b32 v40, s49, 7
+; GCN-NEXT: v_writelane_b32 v40, s50, 8
+; GCN-NEXT: v_writelane_b32 v40, s51, 9
+; GCN-NEXT: v_writelane_b32 v40, s52, 10
+; GCN-NEXT: v_writelane_b32 v40, s53, 11
+; GCN-NEXT: v_writelane_b32 v40, s54, 12
+; GCN-NEXT: v_writelane_b32 v40, s55, 13
+; GCN-NEXT: v_writelane_b32 v40, s30, 14
+; GCN-NEXT: v_writelane_b32 v40, s31, 15
; GCN-NEXT: s_mov_b64 s[4:5], exec
; GCN-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GCN-NEXT: v_readfirstlane_b32 s8, v1
@@ -1152,22 +1152,22 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) {
; GCN-NEXT: ; %bb.2:
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v0, v3
-; GCN-NEXT: v_readlane_b32 s55, v40, 15
-; GCN-NEXT: v_readlane_b32 s54, v40, 14
-; GCN-NEXT: v_readlane_b32 s53, v40, 13
-; GCN-NEXT: v_readlane_b32 s52, v40, 12
-; GCN-NEXT: v_readlane_b32 s51, v40, 11
-; GCN-NEXT: v_readlane_b32 s50, v40, 10
-; GCN-NEXT: v_readlane_b32 s49, v40, 9
-; GCN-NEXT: v_readlane_b32 s48, v40, 8
-; GCN-NEXT: v_readlane_b32 s39, v40, 7
-; GCN-NEXT: v_readlane_b32 s38, v40, 6
-; GCN-NEXT: v_readlane_b32 s37, v40, 5
-; GCN-NEXT: v_readlane_b32 s36, v40, 4
-; GCN-NEXT: v_readlane_b32 s35, v40, 3
-; GCN-NEXT: v_readlane_b32 s34, v40, 2
-; GCN-NEXT: v_readlane_b32 s31, v40, 1
-; GCN-NEXT: v_readlane_b32 s30, v40, 0
+; GCN-NEXT: v_readlane_b32 s30, v40, 14
+; GCN-NEXT: v_readlane_b32 s31, v40, 15
+; GCN-NEXT: v_readlane_b32 s55, v40, 13
+; GCN-NEXT: v_readlane_b32 s54, v40, 12
+; GCN-NEXT: v_readlane_b32 s53, v40, 11
+; GCN-NEXT: v_readlane_b32 s52, v40, 10
+; GCN-NEXT: v_readlane_b32 s51, v40, 9
+; GCN-NEXT: v_readlane_b32 s50, v40, 8
+; GCN-NEXT: v_readlane_b32 s49, v40, 7
+; GCN-NEXT: v_readlane_b32 s48, v40, 6
+; GCN-NEXT: v_readlane_b32 s39, v40, 5
+; GCN-NEXT: v_readlane_b32 s38, v40, 4
+; GCN-NEXT: v_readlane_b32 s37, v40, 3
+; GCN-NEXT: v_readlane_b32 s36, v40, 2
+; GCN-NEXT: v_readlane_b32 s35, v40, 1
+; GCN-NEXT: v_readlane_b32 s34, v40, 0
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -1185,22 +1185,22 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) {
; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GISEL-NEXT: s_mov_b64 exec, s[4:5]
; GISEL-NEXT: s_addk_i32 s32, 0x400
-; GISEL-NEXT: v_writelane_b32 v40, s30, 0
-; GISEL-NEXT: v_writelane_b32 v40, s31, 1
-; GISEL-NEXT: v_writelane_b32 v40, s34, 2
-; GISEL-NEXT: v_writelane_b32 v40, s35, 3
-; GISEL-NEXT: v_writelane_b32 v40, s36, 4
-; GISEL-NEXT: v_writelane_b32 v40, s37, 5
-; GISEL-NEXT: v_writelane_b32 v40, s38, 6
-; GISEL-NEXT: v_writelane_b32 v40, s39, 7
-; GISEL-NEXT: v_writelane_b32 v40, s48, 8
-; GISEL-NEXT: v_writelane_b32 v40, s49, 9
-; GISEL-NEXT: v_writelane_b32 v40, s50, 10
-; GISEL-NEXT: v_writelane_b32 v40, s51, 11
-; GISEL-NEXT: v_writelane_b32 v40, s52, 12
-; GISEL-NEXT: v_writelane_b32 v40, s53, 13
-; GISEL-NEXT: v_writelane_b32 v40, s54, 14
-; GISEL-NEXT: v_writelane_b32 v40, s55, 15
+; GISEL-NEXT: v_writelane_b32 v40, s34, 0
+; GISEL-NEXT: v_writelane_b32 v40, s35, 1
+; GISEL-NEXT: v_writelane_b32 v40, s36, 2
+; GISEL-NEXT: v_writelane_b32 v40, s37, 3
+; GISEL-NEXT: v_writelane_b32 v40, s38, 4
+; GISEL-NEXT: v_writelane_b32 v40, s39, 5
+; GISEL-NEXT: v_writelane_b32 v40, s48, 6
+; GISEL-NEXT: v_writelane_b32 v40, s49, 7
+; GISEL-NEXT: v_writelane_b32 v40, s50, 8
+; GISEL-NEXT: v_writelane_b32 v40, s51, 9
+; GISEL-NEXT: v_writelane_b32 v40, s52, 10
+; GISEL-NEXT: v_writelane_b32 v40, s53, 11
+; GISEL-NEXT: v_writelane_b32 v40, s54, 12
+; GISEL-NEXT: v_writelane_b32 v40, s55, 13
+; GISEL-NEXT: v_writelane_b32 v40, s30, 14
+; GISEL-NEXT: v_writelane_b32 v40, s31, 15
; GISEL-NEXT: s_mov_b64 s[4:5], exec
; GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GISEL-NEXT: v_readfirstlane_b32 s8, v1
@@ -1216,22 +1216,22 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) {
; GISEL-NEXT: ; %bb.2:
; GISEL-NEXT: s_mov_b64 exec, s[4:5]
; GISEL-NEXT: v_mov_b32_e32 v0, v2
-; GISEL-NEXT: v_readlane_b32 s55, v40, 15
-; GISEL-NEXT: v_readlane_b32 s54, v40, 14
-; GISEL-NEXT: v_readlane_b32 s53, v40, 13
-; GISEL-NEXT: v_readlane_b32 s52, v40, 12
-; GISEL-NEXT: v_readlane_b32 s51, v40, 11
-; GISEL-NEXT: v_readlane_b32 s50, v40, 10
-; GISEL-NEXT: v_readlane_b32 s49, v40, 9
-; GISEL-NEXT: v_readlane_b32 s48, v40, 8
-; GISEL-NEXT: v_readlane_b32 s39, v40, 7
-; GISEL-NEXT: v_readlane_b32 s38, v40, 6
-; GISEL-NEXT: v_readlane_b32 s37, v40, 5
-; GISEL-NEXT: v_readlane_b32 s36, v40, 4
-; GISEL-NEXT: v_readlane_b32 s35, v40, 3
-; GISEL-NEXT: v_readlane_b32 s34, v40, 2
-; GISEL-NEXT: v_readlane_b32 s31, v40, 1
-; GISEL-NEXT: v_readlane_b32 s30, v40, 0
+; GISEL-NEXT: v_readlane_b32 s30, v40, 14
+; GISEL-NEXT: v_readlane_b32 s31, v40, 15
+; GISEL-NEXT: v_readlane_b32 s55, v40, 13
+; GISEL-NEXT: v_readlane_b32 s54, v40, 12
+; GISEL-NEXT: v_readlane_b32 s53, v40, 11
+; GISEL-NEXT: v_readlane_b32 s52, v40, 10
+; GISEL-NEXT: v_readlane_b32 s51, v40, 9
+; GISEL-NEXT: v_readlane_b32 s50, v40, 8
+; GISEL-NEXT: v_readlane_b32 s49, v40, 7
+; GISEL-NEXT: v_readlane_b32 s48, v40, 6
+; GISEL-NEXT: v_readlane_b32 s39, v40, 5
+; GISEL-NEXT: v_readlane_b32 s38, v40, 4
+; GISEL-NEXT: v_readlane_b32 s37, v40, 3
+; GISEL-NEXT: v_readlane_b32 s36, v40, 2
+; GISEL-NEXT: v_readlane_b32 s35, v40, 1
+; GISEL-NEXT: v_readlane_b32 s34, v40, 0
; GISEL-NEXT: s_mov_b32 s32, s33
; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -1254,22 +1254,22 @@ define void @test_indirect_tail_call_vgpr_ptr(ptr %fptr) {
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_addk_i32 s32, 0x400
-; GCN-NEXT: v_writelane_b32 v40, s30, 0
-; GCN-NEXT: v_writelane_b32 v40, s31, 1
-; GCN-NEXT: v_writelane_b32 v40, s34, 2
-; GCN-NEXT: v_writelane_b32 v40, s35, 3
-; GCN-NEXT: v_writelane_b32 v40, s36, 4
-; GCN-NEXT: v_writelane_b32 v40, s37, 5
-; GCN-NEXT: v_writelane_b32 v40, s38, 6
-; GCN-NEXT: v_writelane_b32 v40, s39, 7
-; GCN-NEXT: v_writelane_b32 v40, s48, 8
-; GCN-NEXT: v_writelane_b32 v40, s49, 9
-; GCN-NEXT: v_writelane_b32 v40, s50, 10
-; GCN-NEXT: v_writelane_b32 v40, s51, 11
-; GCN-NEXT: v_writelane_b32 v40, s52, 12
-; GCN-NEXT: v_writelane_b32 v40, s53, 13
-; GCN-NEXT: v_writelane_b32 v40, s54, 14
-; GCN-NEXT: v_writelane_b32 v40, s55, 15
+; GCN-NEXT: v_writelane_b32 v40, s34, 0
+; GCN-NEXT: v_writelane_b32 v40, s35, 1
+; GCN-NEXT: v_writelane_b32 v40, s36, 2
+; GCN-NEXT: v_writelane_b32 v40, s37, 3
+; GCN-NEXT: v_writelane_b32 v40, s38, 4
+; GCN-NEXT: v_writelane_b32 v40, s39, 5
+; GCN-NEXT: v_writelane_b32 v40, s48, 6
+; GCN-NEXT: v_writelane_b32 v40, s49, 7
+; GCN-NEXT: v_writelane_b32 v40, s50, 8
+; GCN-NEXT: v_writelane_b32 v40, s51, 9
+; GCN-NEXT: v_writelane_b32 v40, s52, 10
+; GCN-NEXT: v_writelane_b32 v40, s53, 11
+; GCN-NEXT: v_writelane_b32 v40, s54, 12
+; GCN-NEXT: v_writelane_b32 v40, s55, 13
+; GCN-NEXT: v_writelane_b32 v40, s30, 14
+; GCN-NEXT: v_writelane_b32 v40, s31, 15
; GCN-NEXT: s_mov_b64 s[4:5], exec
; GCN-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
; GCN-NEXT: v_readfirstlane_b32 s6, v0
@@ -1282,22 +1282,22 @@ define void @test_indirect_tail_call_vgpr_ptr(ptr %fptr) {
; GCN-NEXT: s_cbranch_execnz .LBB9_1
; GCN-NEXT: ; %bb.2:
; GCN-NEXT: s_mov_b64 exec, s[4:5]
-; GCN-NEXT: v_readlane_b32 s55, v40, 15
-; GCN-NEXT: v_readlane_b32 s54, v40, 14
-; GCN-NEXT: v_readlane_b32 s53, v40, 13
-; GCN-NEXT: v_readlane_b32 s52, v40, 12
-; GCN-NEXT: v_readlane_b32 s51, v40, 11
-; GCN-NEXT: v_readlane_b32 s50, v40, 10
-; GCN-NEXT: v_readlane_b32 s49, v40, 9
-; GCN-NEXT: v_readlane_b32 s48, v40, 8
-; GCN-NEXT: v_readlane_b32 s39, v40, 7
-; GCN-NEXT: v_readlane_b32 s38, v40, 6
-; GCN-NEXT: v_readlane_b32 s37, v40, 5
-; GCN-NEXT: v_readlane_b32 s36, v40, 4
-; GCN-NEXT: v_readlane_b32 s35, v40, 3
-; GCN-NEXT: v_readlane_b32 s34, v40, 2
-; GCN-NEXT: v_readlane_b32 s31, v40, 1
-; GCN-NEXT: v_readlane_b32 s30, v40, 0
+; GCN-NEXT: v_readlane_b32 s30, v40, 14
+; GCN-NEXT: v_readlane_b32 s31, v40, 15
+; GCN-NEXT: v_readlane_b32 s55, v40, 13
+; GCN-NEXT: v_readlane_b32 s54, v40, 12
+; GCN-NEXT: v_readlane_b32 s53, v40, 11
+; GCN-NEXT: v_readlane_b32 s52, v40, 10
+; GCN-NEXT: v_readlane_b32 s51, v40, 9
+; GCN-NEXT: v_readlane_b32 s50, v40, 8
+; GCN-NEXT: v_readlane_b32 s49, v40, 7
+; GCN-NEXT: v_readlane_b32 s48, v40, 6
+; GCN-NEXT: v_readlane_b32 s39, v40, 5
+; GCN-NEXT: v_readlane_b32 s38, v40, 4
+; GCN-NEXT: v_readlane_b32 s37, v40, 3
+; GCN-NEXT: v_readlane_b32 s36, v40, 2
+; GCN-NEXT: v_readlane_b32 s35, v40, 1
+; GCN-NEXT: v_readlane_b32 s34, v40, 0
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -1315,22 +1315,22 @@ define void @test_indirect_tail_call_vgpr_ptr(ptr %fptr) {
; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GISEL-NEXT: s_mov_b64 exec, s[4:5]
; GISEL-NEXT: s_addk_i32 s32, 0x400
-; GISEL-NEXT: v_writelane_b32 v40, s30, 0
-; GISEL-NEXT: v_writelane_b32 v40, s31, 1
-; GISEL-NEXT: v_writelane_b32 v40, s34, 2
-; GISEL-NEXT: v_writelane_b32 v40, s35, 3
-; GISEL-NEXT: v_writelane_b32 v40, s36, 4
-; GISEL-NEXT: v_writelane_b32 v40, s37, 5
-; GISEL-NEXT: v_writelane_b32 v40, s38, 6
-; GISEL-NEXT: v_writelane_b32 v40, s39, 7
-; GISEL-NEXT: v_writelane_b32 v40, s48, 8
-; GISEL-NEXT: v_writelane_b32 v40, s49, 9
-; GISEL-NEXT: v_writelane_b32 v40, s50, 10
-; GISEL-NEXT: v_writelane_b32 v40, s51, 11
-; GISEL-NEXT: v_writelane_b32 v40, s52, 12
-; GISEL-NEXT: v_writelane_b32 v40, s53, 13
-; GISEL-NEXT: v_writelane_b32 v40, s54, 14
-; GISEL-NEXT: v_writelane_b32 v40, s55, 15
+; GISEL-NEXT: v_writelane_b32 v40, s34, 0
+; GISEL-NEXT: v_writelane_b32 v40, s35, 1
+; GISEL-NEXT: v_writelane_b32 v40, s36, 2
+; GISEL-NEXT: v_writelane_b32 v40, s37, 3
+; GISEL-NEXT: v_writelane_b32 v40, s38, 4
+; GISEL-NEXT: v_writelane_b32 v40, s39, 5
+; GISEL-NEXT: v_writelane_b32 v40, s48, 6
+; GISEL-NEXT: v_writelane_b32 v40, s49, 7
+; GISEL-NEXT: v_writelane_b32 v40, s50, 8
+; GISEL-NEXT: v_writelane_b32 v40, s51, 9
+; GISEL-NEXT: v_writelane_b32 v40, s52, 10
+; GISEL-NEXT: v_writelane_b32 v40, s53, 11
+; GISEL-NEXT: v_writelane_b32 v40, s54, 12
+; GISEL-NEXT: v_writelane_b32 v40, s55, 13
+; GISEL-NEXT: v_writelane_b32 v40, s30, 14
+; GISEL-NEXT: v_writelane_b32 v40, s31, 15
; GISEL-NEXT: s_mov_b64 s[4:5], exec
; GISEL-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
; GISEL-NEXT: v_readfirstlane_b32 s6, v0
@@ -1343,22 +1343,22 @@ define void @test_indirect_tail_call_vgpr_ptr(ptr %fptr) {
; GISEL-NEXT: s_cbranch_execnz .LBB9_1
; GISEL-NEXT: ; %bb.2:
; GISEL-NEXT: s_mov_b64 exec, s[4:5]
-; GISEL-NEXT: v_readlane_b32 s55, v40, 15
-; GISEL-NEXT: v_readlane_b32 s54, v40, 14
-; GISEL-NEXT: v_readlane_b32 s53, v40, 13
-; GISEL-NEXT: v_readlane_b32 s52, v40, 12
-; GISEL-NEXT: v_readlane_b32 s51, v40, 11
-; GISEL-NEXT: v_readlane_b32 s50, v40, 10
-; GISEL-NEXT: v_readlane_b32 s49, v40, 9
-; GISEL-NEXT: v_readlane_b32 s48, v40, 8
-; GISEL-NEXT: v_readlane_b32 s39, v40, 7
-; GISEL-NEXT: v_readlane_b32 s38, v40, 6
-; GISEL-NEXT: v_readlane_b32 s37, v40, 5
-; GISEL-NEXT: v_readlane_b32 s36, v40, 4
-; GISEL-NEXT: v_readlane_b32 s35, v40, 3
-; GISEL-NEXT: v_readlane_b32 s34, v40, 2
-; GISEL-NEXT: v_readlane_b32 s31, v40, 1
-; GISEL-NEXT: v_readlane_b32 s30, v40, 0
+; GISEL-NEXT: v_readlane_b32 s30, v40, 14
+; GISEL-NEXT: v_readlane_b32 s31, v40, 15
+; GISEL-NEXT: v_readlane_b32 s55, v40, 13
+; GISEL-NEXT: v_readlane_b32 s54, v40, 12
+; GISEL-NEXT: v_readlane_b32 s53, v40, 11
+; GISEL-NEXT: v_readlane_b32 s52, v40, 10
+; GISEL-NEXT: v_readlane_b32 s51, v40, 9
+; GISEL-NEXT: v_readlane_b32 s50, v40, 8
+; GISEL-NEXT: v_readlane_b32 s49, v40, 7
+; GISEL-NEXT: v_readlane_b32 s48, v40, 6
+; GISEL-NEXT: v_readlane_b32 s39, v40, 5
+; GISEL-NEXT: v_readlane_b32 s38, v40, 4
+; GISEL-NEXT: v_readlane_b32 s37, v40, 3
+; GISEL-NEXT: v_readlane_b32 s36, v40, 2
+; GISEL-NEXT: v_readlane_b32 s35, v40, 1
+; GISEL-NEXT: v_readlane_b32 s34, v40, 0
; GISEL-NEXT: s_mov_b32 s32, s33
; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
index c3f391786f878..fcc43ffd0140e 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
@@ -35,8 +35,8 @@ define void @f0() {
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v4, 1
; GFX11-NEXT: v_readlane_b32 s30, v4, 0
+; GFX11-NEXT: v_readlane_b32 s31, v4, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_load_b32 v4, off, s33 ; 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll
index df784a6e00cfc..19869e85ec9d9 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll
@@ -7,13 +7,13 @@ define fastcc i32 @foo() #0 {
; CHECK-LABEL: name: foo
; CHECK: bb.0 (%ir-block.0):
; CHECK-NEXT: successors: %bb.1(0x80000000)
- ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr30, $sgpr31, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11
+ ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: S_WAITCNT 0
; CHECK-NEXT: $sgpr16 = S_MOV_B32 $sgpr33
; CHECK-NEXT: $sgpr33 = S_MOV_B32 $sgpr32
; CHECK-NEXT: $sgpr17 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
- ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5)
+ ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5)
; CHECK-NEXT: $exec_lo = S_MOV_B32 killed $sgpr17
; CHECK-NEXT: $sgpr32 = frame-setup S_ADDK_I32 $sgpr32, 512, implicit-def dead $scc
; CHECK-NEXT: $vgpr40 = V_WRITELANE_B32 killed $sgpr16, 2, undef $vgpr40
@@ -26,8 +26,8 @@ define fastcc i32 @foo() #0 {
; CHECK-NEXT: BUFFER_GL1_INV implicit $exec
; CHECK-NEXT: BUFFER_GL0_INV implicit $exec
; CHECK-NEXT: renamable $sgpr16_sgpr17 = S_LOAD_DWORDX2_IMM killed renamable $sgpr16_sgpr17, 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4)
- ; CHECK-NEXT: $vgpr40 = V_WRITELANE_B32 killed $sgpr30, 0, $vgpr40
- ; CHECK-NEXT: $vgpr40 = V_WRITELANE_B32 killed $sgpr31, 1, $vgpr40
+ ; CHECK-NEXT: $vgpr40 = V_WRITELANE_B32 killed $sgpr30, 0, $vgpr40, implicit-def $sgpr30_sgpr31, implicit $sgpr30_sgpr31
+ ; CHECK-NEXT: $vgpr40 = V_WRITELANE_B32 killed $sgpr31, 1, $vgpr40, implicit $sgpr30_sgpr31
; CHECK-NEXT: S_WAITCNT 49279
; CHECK-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr16_sgpr17, @bar, csr_amdgpu, implicit killed $sgpr4_sgpr5, implicit killed $sgpr6_sgpr7, implicit killed $sgpr8_sgpr9, implicit killed $sgpr10_sgpr11, implicit killed $sgpr12, implicit killed $sgpr13, implicit killed $sgpr14, implicit killed $sgpr15, implicit killed $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: $vcc_lo = S_MOV_B32 $exec_lo
@@ -39,12 +39,12 @@ define fastcc i32 @foo() #0 {
; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc_lo
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2.DummyReturnBlock:
+ ; CHECK-NEXT: $sgpr30 = V_READLANE_B32 $vgpr40, 0, implicit-def $sgpr30_sgpr31
; CHECK-NEXT: $sgpr31 = V_READLANE_B32 $vgpr40, 1
- ; CHECK-NEXT: $sgpr30 = V_READLANE_B32 $vgpr40, 0
; CHECK-NEXT: $sgpr32 = S_MOV_B32 $sgpr33
; CHECK-NEXT: $sgpr4 = V_READLANE_B32 $vgpr40, 2
; CHECK-NEXT: $sgpr5 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
- ; CHECK-NEXT: $vgpr40 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5)
+ ; CHECK-NEXT: $vgpr40 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5)
; CHECK-NEXT: $exec_lo = S_MOV_B32 killed $sgpr5
; CHECK-NEXT: $sgpr33 = S_MOV_B32 killed $sgpr4
; CHECK-NEXT: S_WAITCNT 16240
diff --git a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll
index 17581bcb61e99..8c2c16ccdc2a0 100644
--- a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll
@@ -37,26 +37,26 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
; GFX7-NEXT: s_add_i32 s6, s32, 0x101100
; GFX7-NEXT: buffer_store_dword v23, off, s[0:3], s6 ; 4-byte Folded Spill
; GFX7-NEXT: s_mov_b64 exec, s[4:5]
-; GFX7-NEXT: v_writelane_b32 v23, s30, 0
-; GFX7-NEXT: v_writelane_b32 v23, s31, 1
-; GFX7-NEXT: v_writelane_b32 v23, s33, 2
-; GFX7-NEXT: v_writelane_b32 v23, s34, 3
-; GFX7-NEXT: v_writelane_b32 v23, s35, 4
-; GFX7-NEXT: v_writelane_b32 v23, s36, 5
-; GFX7-NEXT: v_writelane_b32 v23, s37, 6
-; GFX7-NEXT: v_writelane_b32 v23, s38, 7
-; GFX7-NEXT: v_writelane_b32 v23, s39, 8
-; GFX7-NEXT: v_writelane_b32 v23, s48, 9
-; GFX7-NEXT: v_writelane_b32 v23, s49, 10
-; GFX7-NEXT: v_writelane_b32 v23, s50, 11
-; GFX7-NEXT: v_writelane_b32 v23, s51, 12
-; GFX7-NEXT: v_writelane_b32 v23, s52, 13
-; GFX7-NEXT: v_writelane_b32 v23, s53, 14
+; GFX7-NEXT: v_writelane_b32 v23, s33, 0
+; GFX7-NEXT: v_writelane_b32 v23, s34, 1
+; GFX7-NEXT: v_writelane_b32 v23, s35, 2
+; GFX7-NEXT: v_writelane_b32 v23, s36, 3
+; GFX7-NEXT: v_writelane_b32 v23, s37, 4
+; GFX7-NEXT: v_writelane_b32 v23, s38, 5
+; GFX7-NEXT: v_writelane_b32 v23, s39, 6
+; GFX7-NEXT: v_writelane_b32 v23, s48, 7
+; GFX7-NEXT: v_writelane_b32 v23, s49, 8
+; GFX7-NEXT: v_writelane_b32 v23, s50, 9
+; GFX7-NEXT: v_writelane_b32 v23, s51, 10
+; GFX7-NEXT: v_writelane_b32 v23, s52, 11
+; GFX7-NEXT: v_writelane_b32 v23, s53, 12
+; GFX7-NEXT: v_writelane_b32 v23, s54, 13
+; GFX7-NEXT: v_writelane_b32 v23, s55, 14
; GFX7-NEXT: v_lshr_b32_e64 v0, s32, 6
-; GFX7-NEXT: v_writelane_b32 v23, s54, 15
+; GFX7-NEXT: v_writelane_b32 v23, s30, 15
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 64, v0
; GFX7-NEXT: s_and_b64 s[4:5], 0, exec
-; GFX7-NEXT: v_writelane_b32 v23, s55, 16
+; GFX7-NEXT: v_writelane_b32 v23, s31, 16
; GFX7-NEXT: ;;#ASMSTART
; GFX7-NEXT: ; use alloca0 v0
; GFX7-NEXT: ;;#ASMEND
@@ -73,23 +73,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
; GFX7-NEXT: ;;#ASMSTART
; GFX7-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s54, scc
; GFX7-NEXT: ;;#ASMEND
-; GFX7-NEXT: v_readlane_b32 s55, v23, 16
-; GFX7-NEXT: v_readlane_b32 s54, v23, 15
-; GFX7-NEXT: v_readlane_b32 s53, v23, 14
-; GFX7-NEXT: v_readlane_b32 s52, v23, 13
-; GFX7-NEXT: v_readlane_b32 s51, v23, 12
-; GFX7-NEXT: v_readlane_b32 s50, v23, 11
-; GFX7-NEXT: v_readlane_b32 s49, v23, 10
-; GFX7-NEXT: v_readlane_b32 s48, v23, 9
-; GFX7-NEXT: v_readlane_b32 s39, v23, 8
-; GFX7-NEXT: v_readlane_b32 s38, v23, 7
-; GFX7-NEXT: v_readlane_b32 s37, v23, 6
-; GFX7-NEXT: v_readlane_b32 s36, v23, 5
-; GFX7-NEXT: v_readlane_b32 s35, v23, 4
-; GFX7-NEXT: v_readlane_b32 s34, v23, 3
-; GFX7-NEXT: v_readlane_b32 s33, v23, 2
-; GFX7-NEXT: v_readlane_b32 s31, v23, 1
-; GFX7-NEXT: v_readlane_b32 s30, v23, 0
+; GFX7-NEXT: v_readlane_b32 s30, v23, 15
+; GFX7-NEXT: v_readlane_b32 s31, v23, 16
+; GFX7-NEXT: v_readlane_b32 s55, v23, 14
+; GFX7-NEXT: v_readlane_b32 s54, v23, 13
+; GFX7-NEXT: v_readlane_b32 s53, v23, 12
+; GFX7-NEXT: v_readlane_b32 s52, v23, 11
+; GFX7-NEXT: v_readlane_b32 s51, v23, 10
+; GFX7-NEXT: v_readlane_b32 s50, v23, 9
+; GFX7-NEXT: v_readlane_b32 s49, v23, 8
+; GFX7-NEXT: v_readlane_b32 s48, v23, 7
+; GFX7-NEXT: v_readlane_b32 s39, v23, 6
+; GFX7-NEXT: v_readlane_b32 s38, v23, 5
+; GFX7-NEXT: v_readlane_b32 s37, v23, 4
+; GFX7-NEXT: v_readlane_b32 s36, v23, 3
+; GFX7-NEXT: v_readlane_b32 s35, v23, 2
+; GFX7-NEXT: v_readlane_b32 s34, v23, 1
+; GFX7-NEXT: v_readlane_b32 s33, v23, 0
; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX7-NEXT: s_add_i32 s6, s32, 0x101100
; GFX7-NEXT: buffer_load_dword v23, off, s[0:3], s6 ; 4-byte Folded Reload
@@ -104,26 +104,26 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
; GFX8-NEXT: s_add_i32 s6, s32, 0x101100
; GFX8-NEXT: buffer_store_dword v23, off, s[0:3], s6 ; 4-byte Folded Spill
; GFX8-NEXT: s_mov_b64 exec, s[4:5]
-; GFX8-NEXT: v_writelane_b32 v23, s30, 0
-; GFX8-NEXT: v_writelane_b32 v23, s31, 1
-; GFX8-NEXT: v_writelane_b32 v23, s33, 2
-; GFX8-NEXT: v_writelane_b32 v23, s34, 3
-; GFX8-NEXT: v_writelane_b32 v23, s35, 4
-; GFX8-NEXT: v_writelane_b32 v23, s36, 5
-; GFX8-NEXT: v_writelane_b32 v23, s37, 6
-; GFX8-NEXT: v_writelane_b32 v23, s38, 7
-; GFX8-NEXT: v_writelane_b32 v23, s39, 8
-; GFX8-NEXT: v_writelane_b32 v23, s48, 9
-; GFX8-NEXT: v_writelane_b32 v23, s49, 10
-; GFX8-NEXT: v_writelane_b32 v23, s50, 11
-; GFX8-NEXT: v_writelane_b32 v23, s51, 12
-; GFX8-NEXT: v_writelane_b32 v23, s52, 13
-; GFX8-NEXT: v_writelane_b32 v23, s53, 14
+; GFX8-NEXT: v_writelane_b32 v23, s33, 0
+; GFX8-NEXT: v_writelane_b32 v23, s34, 1
+; GFX8-NEXT: v_writelane_b32 v23, s35, 2
+; GFX8-NEXT: v_writelane_b32 v23, s36, 3
+; GFX8-NEXT: v_writelane_b32 v23, s37, 4
+; GFX8-NEXT: v_writelane_b32 v23, s38, 5
+; GFX8-NEXT: v_writelane_b32 v23, s39, 6
+; GFX8-NEXT: v_writelane_b32 v23, s48, 7
+; GFX8-NEXT: v_writelane_b32 v23, s49, 8
+; GFX8-NEXT: v_writelane_b32 v23, s50, 9
+; GFX8-NEXT: v_writelane_b32 v23, s51, 10
+; GFX8-NEXT: v_writelane_b32 v23, s52, 11
+; GFX8-NEXT: v_writelane_b32 v23, s53, 12
+; GFX8-NEXT: v_writelane_b32 v23, s54, 13
+; GFX8-NEXT: v_writelane_b32 v23, s55, 14
; GFX8-NEXT: v_lshrrev_b32_e64 v0, 6, s32
-; GFX8-NEXT: v_writelane_b32 v23, s54, 15
+; GFX8-NEXT: v_writelane_b32 v23, s30, 15
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 64, v0
; GFX8-NEXT: s_and_b64 s[4:5], 0, exec
-; GFX8-NEXT: v_writelane_b32 v23, s55, 16
+; GFX8-NEXT: v_writelane_b32 v23, s31, 16
; GFX8-NEXT: ;;#ASMSTART
; GFX8-NEXT: ; use alloca0 v0
; GFX8-NEXT: ;;#ASMEND
@@ -141,23 +141,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
; GFX8-NEXT: ;;#ASMSTART
; GFX8-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s54, scc
; GFX8-NEXT: ;;#ASMEND
-; GFX8-NEXT: v_readlane_b32 s55, v23, 16
-; GFX8-NEXT: v_readlane_b32 s54, v23, 15
-; GFX8-NEXT: v_readlane_b32 s53, v23, 14
-; GFX8-NEXT: v_readlane_b32 s52, v23, 13
-; GFX8-NEXT: v_readlane_b32 s51, v23, 12
-; GFX8-NEXT: v_readlane_b32 s50, v23, 11
-; GFX8-NEXT: v_readlane_b32 s49, v23, 10
-; GFX8-NEXT: v_readlane_b32 s48, v23, 9
-; GFX8-NEXT: v_readlane_b32 s39, v23, 8
-; GFX8-NEXT: v_readlane_b32 s38, v23, 7
-; GFX8-NEXT: v_readlane_b32 s37, v23, 6
-; GFX8-NEXT: v_readlane_b32 s36, v23, 5
-; GFX8-NEXT: v_readlane_b32 s35, v23, 4
-; GFX8-NEXT: v_readlane_b32 s34, v23, 3
-; GFX8-NEXT: v_readlane_b32 s33, v23, 2
-; GFX8-NEXT: v_readlane_b32 s31, v23, 1
-; GFX8-NEXT: v_readlane_b32 s30, v23, 0
+; GFX8-NEXT: v_readlane_b32 s30, v23, 15
+; GFX8-NEXT: v_readlane_b32 s31, v23, 16
+; GFX8-NEXT: v_readlane_b32 s55, v23, 14
+; GFX8-NEXT: v_readlane_b32 s54, v23, 13
+; GFX8-NEXT: v_readlane_b32 s53, v23, 12
+; GFX8-NEXT: v_readlane_b32 s52, v23, 11
+; GFX8-NEXT: v_readlane_b32 s51, v23, 10
+; GFX8-NEXT: v_readlane_b32 s50, v23, 9
+; GFX8-NEXT: v_readlane_b32 s49, v23, 8
+; GFX8-NEXT: v_readlane_b32 s48, v23, 7
+; GFX8-NEXT: v_readlane_b32 s39, v23, 6
+; GFX8-NEXT: v_readlane_b32 s38, v23, 5
+; GFX8-NEXT: v_readlane_b32 s37, v23, 4
+; GFX8-NEXT: v_readlane_b32 s36, v23, 3
+; GFX8-NEXT: v_readlane_b32 s35, v23, 2
+; GFX8-NEXT: v_readlane_b32 s34, v23, 1
+; GFX8-NEXT: v_readlane_b32 s33, v23, 0
; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX8-NEXT: s_add_i32 s6, s32, 0x101100
; GFX8-NEXT: buffer_load_dword v23, off, s[0:3], s6 ; 4-byte Folded Reload
@@ -172,26 +172,26 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
; GFX900-NEXT: s_add_i32 s6, s32, 0x101100
; GFX900-NEXT: buffer_store_dword v23, off, s[0:3], s6 ; 4-byte Folded Spill
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
-; GFX900-NEXT: v_writelane_b32 v23, s30, 0
-; GFX900-NEXT: v_writelane_b32 v23, s31, 1
-; GFX900-NEXT: v_writelane_b32 v23, s33, 2
-; GFX900-NEXT: v_writelane_b32 v23, s34, 3
-; GFX900-NEXT: v_writelane_b32 v23, s35, 4
-; GFX900-NEXT: v_writelane_b32 v23, s36, 5
-; GFX900-NEXT: v_writelane_b32 v23, s37, 6
-; GFX900-NEXT: v_writelane_b32 v23, s38, 7
-; GFX900-NEXT: v_writelane_b32 v23, s39, 8
-; GFX900-NEXT: v_writelane_b32 v23, s48, 9
-; GFX900-NEXT: v_writelane_b32 v23, s49, 10
-; GFX900-NEXT: v_writelane_b32 v23, s50, 11
-; GFX900-NEXT: v_writelane_b32 v23, s51, 12
-; GFX900-NEXT: v_writelane_b32 v23, s52, 13
-; GFX900-NEXT: v_writelane_b32 v23, s53, 14
+; GFX900-NEXT: v_writelane_b32 v23, s33, 0
+; GFX900-NEXT: v_writelane_b32 v23, s34, 1
+; GFX900-NEXT: v_writelane_b32 v23, s35, 2
+; GFX900-NEXT: v_writelane_b32 v23, s36, 3
+; GFX900-NEXT: v_writelane_b32 v23, s37, 4
+; GFX900-NEXT: v_writelane_b32 v23, s38, 5
+; GFX900-NEXT: v_writelane_b32 v23, s39, 6
+; GFX900-NEXT: v_writelane_b32 v23, s48, 7
+; GFX900-NEXT: v_writelane_b32 v23, s49, 8
+; GFX900-NEXT: v_writelane_b32 v23, s50, 9
+; GFX900-NEXT: v_writelane_b32 v23, s51, 10
+; GFX900-NEXT: v_writelane_b32 v23, s52, 11
+; GFX900-NEXT: v_writelane_b32 v23, s53, 12
+; GFX900-NEXT: v_writelane_b32 v23, s54, 13
+; GFX900-NEXT: v_writelane_b32 v23, s55, 14
; GFX900-NEXT: v_lshrrev_b32_e64 v0, 6, s32
-; GFX900-NEXT: v_writelane_b32 v23, s54, 15
+; GFX900-NEXT: v_writelane_b32 v23, s30, 15
; GFX900-NEXT: v_add_u32_e32 v0, 64, v0
; GFX900-NEXT: s_and_b64 s[4:5], 0, exec
-; GFX900-NEXT: v_writelane_b32 v23, s55, 16
+; GFX900-NEXT: v_writelane_b32 v23, s31, 16
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use alloca0 v0
; GFX900-NEXT: ;;#ASMEND
@@ -208,23 +208,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s54, scc
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_readlane_b32 s55, v23, 16
-; GFX900-NEXT: v_readlane_b32 s54, v23, 15
-; GFX900-NEXT: v_readlane_b32 s53, v23, 14
-; GFX900-NEXT: v_readlane_b32 s52, v23, 13
-; GFX900-NEXT: v_readlane_b32 s51, v23, 12
-; GFX900-NEXT: v_readlane_b32 s50, v23, 11
-; GFX900-NEXT: v_readlane_b32 s49, v23, 10
-; GFX900-NEXT: v_readlane_b32 s48, v23, 9
-; GFX900-NEXT: v_readlane_b32 s39, v23, 8
-; GFX900-NEXT: v_readlane_b32 s38, v23, 7
-; GFX900-NEXT: v_readlane_b32 s37, v23, 6
-; GFX900-NEXT: v_readlane_b32 s36, v23, 5
-; GFX900-NEXT: v_readlane_b32 s35, v23, 4
-; GFX900-NEXT: v_readlane_b32 s34, v23, 3
-; GFX900-NEXT: v_readlane_b32 s33, v23, 2
-; GFX900-NEXT: v_readlane_b32 s31, v23, 1
-; GFX900-NEXT: v_readlane_b32 s30, v23, 0
+; GFX900-NEXT: v_readlane_b32 s30, v23, 15
+; GFX900-NEXT: v_readlane_b32 s31, v23, 16
+; GFX900-NEXT: v_readlane_b32 s55, v23, 14
+; GFX900-NEXT: v_readlane_b32 s54, v23, 13
+; GFX900-NEXT: v_readlane_b32 s53, v23, 12
+; GFX900-NEXT: v_readlane_b32 s52, v23, 11
+; GFX900-NEXT: v_readlane_b32 s51, v23, 10
+; GFX900-NEXT: v_readlane_b32 s50, v23, 9
+; GFX900-NEXT: v_readlane_b32 s49, v23, 8
+; GFX900-NEXT: v_readlane_b32 s48, v23, 7
+; GFX900-NEXT: v_readlane_b32 s39, v23, 6
+; GFX900-NEXT: v_readlane_b32 s38, v23, 5
+; GFX900-NEXT: v_readlane_b32 s37, v23, 4
+; GFX900-NEXT: v_readlane_b32 s36, v23, 3
+; GFX900-NEXT: v_readlane_b32 s35, v23, 2
+; GFX900-NEXT: v_readlane_b32 s34, v23, 1
+; GFX900-NEXT: v_readlane_b32 s33, v23, 0
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: s_add_i32 s6, s32, 0x101100
; GFX900-NEXT: buffer_load_dword v23, off, s[0:3], s6 ; 4-byte Folded Reload
@@ -239,26 +239,26 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
; GFX942-NEXT: s_add_i32 s2, s32, 0x4044
; GFX942-NEXT: scratch_store_dword off, v23, s2 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
-; GFX942-NEXT: v_writelane_b32 v23, s30, 0
-; GFX942-NEXT: v_writelane_b32 v23, s31, 1
-; GFX942-NEXT: v_writelane_b32 v23, s33, 2
-; GFX942-NEXT: v_writelane_b32 v23, s34, 3
-; GFX942-NEXT: v_writelane_b32 v23, s35, 4
-; GFX942-NEXT: v_writelane_b32 v23, s36, 5
-; GFX942-NEXT: v_writelane_b32 v23, s37, 6
-; GFX942-NEXT: v_writelane_b32 v23, s38, 7
-; GFX942-NEXT: v_writelane_b32 v23, s39, 8
-; GFX942-NEXT: v_writelane_b32 v23, s48, 9
-; GFX942-NEXT: v_writelane_b32 v23, s49, 10
-; GFX942-NEXT: v_writelane_b32 v23, s50, 11
-; GFX942-NEXT: v_writelane_b32 v23, s51, 12
-; GFX942-NEXT: v_writelane_b32 v23, s52, 13
-; GFX942-NEXT: v_writelane_b32 v23, s53, 14
+; GFX942-NEXT: v_writelane_b32 v23, s33, 0
+; GFX942-NEXT: v_writelane_b32 v23, s34, 1
+; GFX942-NEXT: v_writelane_b32 v23, s35, 2
+; GFX942-NEXT: v_writelane_b32 v23, s36, 3
+; GFX942-NEXT: v_writelane_b32 v23, s37, 4
+; GFX942-NEXT: v_writelane_b32 v23, s38, 5
+; GFX942-NEXT: v_writelane_b32 v23, s39, 6
+; GFX942-NEXT: v_writelane_b32 v23, s48, 7
+; GFX942-NEXT: v_writelane_b32 v23, s49, 8
+; GFX942-NEXT: v_writelane_b32 v23, s50, 9
+; GFX942-NEXT: v_writelane_b32 v23, s51, 10
+; GFX942-NEXT: v_writelane_b32 v23, s52, 11
+; GFX942-NEXT: v_writelane_b32 v23, s53, 12
+; GFX942-NEXT: v_writelane_b32 v23, s54, 13
+; GFX942-NEXT: v_writelane_b32 v23, s55, 14
; GFX942-NEXT: s_add_i32 s0, s32, 64
-; GFX942-NEXT: v_writelane_b32 v23, s54, 15
+; GFX942-NEXT: v_writelane_b32 v23, s30, 15
; GFX942-NEXT: v_mov_b32_e32 v0, s0
; GFX942-NEXT: s_and_b64 s[60:61], 0, exec
-; GFX942-NEXT: v_writelane_b32 v23, s55, 16
+; GFX942-NEXT: v_writelane_b32 v23, s31, 16
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use alloca0 v0
; GFX942-NEXT: ;;#ASMEND
@@ -273,23 +273,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s54, scc
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_readlane_b32 s55, v23, 16
-; GFX942-NEXT: v_readlane_b32 s54, v23, 15
-; GFX942-NEXT: v_readlane_b32 s53, v23, 14
-; GFX942-NEXT: v_readlane_b32 s52, v23, 13
-; GFX942-NEXT: v_readlane_b32 s51, v23, 12
-; GFX942-NEXT: v_readlane_b32 s50, v23, 11
-; GFX942-NEXT: v_readlane_b32 s49, v23, 10
-; GFX942-NEXT: v_readlane_b32 s48, v23, 9
-; GFX942-NEXT: v_readlane_b32 s39, v23, 8
-; GFX942-NEXT: v_readlane_b32 s38, v23, 7
-; GFX942-NEXT: v_readlane_b32 s37, v23, 6
-; GFX942-NEXT: v_readlane_b32 s36, v23, 5
-; GFX942-NEXT: v_readlane_b32 s35, v23, 4
-; GFX942-NEXT: v_readlane_b32 s34, v23, 3
-; GFX942-NEXT: v_readlane_b32 s33, v23, 2
-; GFX942-NEXT: v_readlane_b32 s31, v23, 1
-; GFX942-NEXT: v_readlane_b32 s30, v23, 0
+; GFX942-NEXT: v_readlane_b32 s30, v23, 15
+; GFX942-NEXT: v_readlane_b32 s31, v23, 16
+; GFX942-NEXT: v_readlane_b32 s55, v23, 14
+; GFX942-NEXT: v_readlane_b32 s54, v23, 13
+; GFX942-NEXT: v_readlane_b32 s53, v23, 12
+; GFX942-NEXT: v_readlane_b32 s52, v23, 11
+; GFX942-NEXT: v_readlane_b32 s51, v23, 10
+; GFX942-NEXT: v_readlane_b32 s50, v23, 9
+; GFX942-NEXT: v_readlane_b32 s49, v23, 8
+; GFX942-NEXT: v_readlane_b32 s48, v23, 7
+; GFX942-NEXT: v_readlane_b32 s39, v23, 6
+; GFX942-NEXT: v_readlane_b32 s38, v23, 5
+; GFX942-NEXT: v_readlane_b32 s37, v23, 4
+; GFX942-NEXT: v_readlane_b32 s36, v23, 3
+; GFX942-NEXT: v_readlane_b32 s35, v23, 2
+; GFX942-NEXT: v_readlane_b32 s34, v23, 1
+; GFX942-NEXT: v_readlane_b32 s33, v23, 0
; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX942-NEXT: s_add_i32 s2, s32, 0x4044
; GFX942-NEXT: scratch_load_dword v23, off, s2 ; 4-byte Folded Reload
@@ -305,29 +305,29 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
; GFX10_1-NEXT: buffer_store_dword v23, off, s[0:3], s5 ; 4-byte Folded Spill
; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3
; GFX10_1-NEXT: s_mov_b32 exec_lo, s4
-; GFX10_1-NEXT: v_writelane_b32 v23, s30, 0
+; GFX10_1-NEXT: v_writelane_b32 v23, s33, 0
; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s32
; GFX10_1-NEXT: s_and_b32 s4, 0, exec_lo
-; GFX10_1-NEXT: v_writelane_b32 v23, s31, 1
+; GFX10_1-NEXT: v_writelane_b32 v23, s34, 1
; GFX10_1-NEXT: v_add_nc_u32_e32 v0, 64, v0
; GFX10_1-NEXT: ;;#ASMSTART
; GFX10_1-NEXT: ; use alloca0 v0
; GFX10_1-NEXT: ;;#ASMEND
-; GFX10_1-NEXT: v_writelane_b32 v23, s33, 2
-; GFX10_1-NEXT: v_writelane_b32 v23, s34, 3
-; GFX10_1-NEXT: v_writelane_b32 v23, s35, 4
-; GFX10_1-NEXT: v_writelane_b32 v23, s36, 5
-; GFX10_1-NEXT: v_writelane_b32 v23, s37, 6
-; GFX10_1-NEXT: v_writelane_b32 v23, s38, 7
-; GFX10_1-NEXT: v_writelane_b32 v23, s39, 8
-; GFX10_1-NEXT: v_writelane_b32 v23, s48, 9
-; GFX10_1-NEXT: v_writelane_b32 v23, s49, 10
-; GFX10_1-NEXT: v_writelane_b32 v23, s50, 11
-; GFX10_1-NEXT: v_writelane_b32 v23, s51, 12
-; GFX10_1-NEXT: v_writelane_b32 v23, s52, 13
-; GFX10_1-NEXT: v_writelane_b32 v23, s53, 14
-; GFX10_1-NEXT: v_writelane_b32 v23, s54, 15
-; GFX10_1-NEXT: v_writelane_b32 v23, s55, 16
+; GFX10_1-NEXT: v_writelane_b32 v23, s35, 2
+; GFX10_1-NEXT: v_writelane_b32 v23, s36, 3
+; GFX10_1-NEXT: v_writelane_b32 v23, s37, 4
+; GFX10_1-NEXT: v_writelane_b32 v23, s38, 5
+; GFX10_1-NEXT: v_writelane_b32 v23, s39, 6
+; GFX10_1-NEXT: v_writelane_b32 v23, s48, 7
+; GFX10_1-NEXT: v_writelane_b32 v23, s49, 8
+; GFX10_1-NEXT: v_writelane_b32 v23, s50, 9
+; GFX10_1-NEXT: v_writelane_b32 v23, s51, 10
+; GFX10_1-NEXT: v_writelane_b32 v23, s52, 11
+; GFX10_1-NEXT: v_writelane_b32 v23, s53, 12
+; GFX10_1-NEXT: v_writelane_b32 v23, s54, 13
+; GFX10_1-NEXT: v_writelane_b32 v23, s55, 14
+; GFX10_1-NEXT: v_writelane_b32 v23, s30, 15
+; GFX10_1-NEXT: v_writelane_b32 v23, s31, 16
; GFX10_1-NEXT: ;;#ASMSTART
; GFX10_1-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc
; GFX10_1-NEXT: ;;#ASMEND
@@ -338,23 +338,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
; GFX10_1-NEXT: ;;#ASMSTART
; GFX10_1-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s54, scc
; GFX10_1-NEXT: ;;#ASMEND
-; GFX10_1-NEXT: v_readlane_b32 s55, v23, 16
-; GFX10_1-NEXT: v_readlane_b32 s54, v23, 15
-; GFX10_1-NEXT: v_readlane_b32 s53, v23, 14
-; GFX10_1-NEXT: v_readlane_b32 s52, v23, 13
-; GFX10_1-NEXT: v_readlane_b32 s51, v23, 12
-; GFX10_1-NEXT: v_readlane_b32 s50, v23, 11
-; GFX10_1-NEXT: v_readlane_b32 s49, v23, 10
-; GFX10_1-NEXT: v_readlane_b32 s48, v23, 9
-; GFX10_1-NEXT: v_readlane_b32 s39, v23, 8
-; GFX10_1-NEXT: v_readlane_b32 s38, v23, 7
-; GFX10_1-NEXT: v_readlane_b32 s37, v23, 6
-; GFX10_1-NEXT: v_readlane_b32 s36, v23, 5
-; GFX10_1-NEXT: v_readlane_b32 s35, v23, 4
-; GFX10_1-NEXT: v_readlane_b32 s34, v23, 3
-; GFX10_1-NEXT: v_readlane_b32 s33, v23, 2
-; GFX10_1-NEXT: v_readlane_b32 s31, v23, 1
-; GFX10_1-NEXT: v_readlane_b32 s30, v23, 0
+; GFX10_1-NEXT: v_readlane_b32 s30, v23, 15
+; GFX10_1-NEXT: v_readlane_b32 s31, v23, 16
+; GFX10_1-NEXT: v_readlane_b32 s55, v23, 14
+; GFX10_1-NEXT: v_readlane_b32 s54, v23, 13
+; GFX10_1-NEXT: v_readlane_b32 s53, v23, 12
+; GFX10_1-NEXT: v_readlane_b32 s52, v23, 11
+; GFX10_1-NEXT: v_readlane_b32 s51, v23, 10
+; GFX10_1-NEXT: v_readlane_b32 s50, v23, 9
+; GFX10_1-NEXT: v_readlane_b32 s49, v23, 8
+; GFX10_1-NEXT: v_readlane_b32 s48, v23, 7
+; GFX10_1-NEXT: v_readlane_b32 s39, v23, 6
+; GFX10_1-NEXT: v_readlane_b32 s38, v23, 5
+; GFX10_1-NEXT: v_readlane_b32 s37, v23, 4
+; GFX10_1-NEXT: v_readlane_b32 s36, v23, 3
+; GFX10_1-NEXT: v_readlane_b32 s35, v23, 2
+; GFX10_1-NEXT: v_readlane_b32 s34, v23, 1
+; GFX10_1-NEXT: v_readlane_b32 s33, v23, 0
; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1
; GFX10_1-NEXT: s_add_i32 s5, s32, 0x80880
; GFX10_1-NEXT: buffer_load_dword v23, off, s[0:3], s5 ; 4-byte Folded Reload
@@ -370,29 +370,29 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
; GFX10_3-NEXT: s_add_i32 s5, s32, 0x80880
; GFX10_3-NEXT: buffer_store_dword v23, off, s[0:3], s5 ; 4-byte Folded Spill
; GFX10_3-NEXT: s_mov_b32 exec_lo, s4
-; GFX10_3-NEXT: v_writelane_b32 v23, s30, 0
+; GFX10_3-NEXT: v_writelane_b32 v23, s33, 0
; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s32
; GFX10_3-NEXT: s_and_b32 s4, 0, exec_lo
-; GFX10_3-NEXT: v_writelane_b32 v23, s31, 1
+; GFX10_3-NEXT: v_writelane_b32 v23, s34, 1
; GFX10_3-NEXT: v_add_nc_u32_e32 v0, 64, v0
; GFX10_3-NEXT: ;;#ASMSTART
; GFX10_3-NEXT: ; use alloca0 v0
; GFX10_3-NEXT: ;;#ASMEND
-; GFX10_3-NEXT: v_writelane_b32 v23, s33, 2
-; GFX10_3-NEXT: v_writelane_b32 v23, s34, 3
-; GFX10_3-NEXT: v_writelane_b32 v23, s35, 4
-; GFX10_3-NEXT: v_writelane_b32 v23, s36, 5
-; GFX10_3-NEXT: v_writelane_b32 v23, s37, 6
-; GFX10_3-NEXT: v_writelane_b32 v23, s38, 7
-; GFX10_3-NEXT: v_writelane_b32 v23, s39, 8
-; GFX10_3-NEXT: v_writelane_b32 v23, s48, 9
-; GFX10_3-NEXT: v_writelane_b32 v23, s49, 10
-; GFX10_3-NEXT: v_writelane_b32 v23, s50, 11
-; GFX10_3-NEXT: v_writelane_b32 v23, s51, 12
-; GFX10_3-NEXT: v_writelane_b32 v23, s52, 13
-; GFX10_3-NEXT: v_writelane_b32 v23, s53, 14
-; GFX10_3-NEXT: v_writelane_b32 v23, s54, 15
-; GFX10_3-NEXT: v_writelane_b32 v23, s55, 16
+; GFX10_3-NEXT: v_writelane_b32 v23, s35, 2
+; GFX10_3-NEXT: v_writelane_b32 v23, s36, 3
+; GFX10_3-NEXT: v_writelane_b32 v23, s37, 4
+; GFX10_3-NEXT: v_writelane_b32 v23, s38, 5
+; GFX10_3-NEXT: v_writelane_b32 v23, s39, 6
+; GFX10_3-NEXT: v_writelane_b32 v23, s48, 7
+; GFX10_3-NEXT: v_writelane_b32 v23, s49, 8
+; GFX10_3-NEXT: v_writelane_b32 v23, s50, 9
+; GFX10_3-NEXT: v_writelane_b32 v23, s51, 10
+; GFX10_3-NEXT: v_writelane_b32 v23, s52, 11
+; GFX10_3-NEXT: v_writelane_b32 v23, s53, 12
+; GFX10_3-NEXT: v_writelane_b32 v23, s54, 13
+; GFX10_3-NEXT: v_writelane_b32 v23, s55, 14
+; GFX10_3-NEXT: v_writelane_b32 v23, s30, 15
+; GFX10_3-NEXT: v_writelane_b32 v23, s31, 16
; GFX10_3-NEXT: ;;#ASMSTART
; GFX10_3-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc
; GFX10_3-NEXT: ;;#ASMEND
@@ -403,23 +403,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
; GFX10_3-NEXT: ;;#ASMSTART
; GFX10_3-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s54, scc
; GFX10_3-NEXT: ;;#ASMEND
-; GFX10_3-NEXT: v_readlane_b32 s55, v23, 16
-; GFX10_3-NEXT: v_readlane_b32 s54, v23, 15
-; GFX10_3-NEXT: v_readlane_b32 s53, v23, 14
-; GFX10_3-NEXT: v_readlane_b32 s52, v23, 13
-; GFX10_3-NEXT: v_readlane_b32 s51, v23, 12
-; GFX10_3-NEXT: v_readlane_b32 s50, v23, 11
-; GFX10_3-NEXT: v_readlane_b32 s49, v23, 10
-; GFX10_3-NEXT: v_readlane_b32 s48, v23, 9
-; GFX10_3-NEXT: v_readlane_b32 s39, v23, 8
-; GFX10_3-NEXT: v_readlane_b32 s38, v23, 7
-; GFX10_3-NEXT: v_readlane_b32 s37, v23, 6
-; GFX10_3-NEXT: v_readlane_b32 s36, v23, 5
-; GFX10_3-NEXT: v_readlane_b32 s35, v23, 4
-; GFX10_3-NEXT: v_readlane_b32 s34, v23, 3
-; GFX10_3-NEXT: v_readlane_b32 s33, v23, 2
-; GFX10_3-NEXT: v_readlane_b32 s31, v23, 1
-; GFX10_3-NEXT: v_readlane_b32 s30, v23, 0
+; GFX10_3-NEXT: v_readlane_b32 s30, v23, 15
+; GFX10_3-NEXT: v_readlane_b32 s31, v23, 16
+; GFX10_3-NEXT: v_readlane_b32 s55, v23, 14
+; GFX10_3-NEXT: v_readlane_b32 s54, v23, 13
+; GFX10_3-NEXT: v_readlane_b32 s53, v23, 12
+; GFX10_3-NEXT: v_readlane_b32 s52, v23, 11
+; GFX10_3-NEXT: v_readlane_b32 s51, v23, 10
+; GFX10_3-NEXT: v_readlane_b32 s50, v23, 9
+; GFX10_3-NEXT: v_readlane_b32 s49, v23, 8
+; GFX10_3-NEXT: v_readlane_b32 s48, v23, 7
+; GFX10_3-NEXT: v_readlane_b32 s39, v23, 6
+; GFX10_3-NEXT: v_readlane_b32 s38, v23, 5
+; GFX10_3-NEXT: v_readlane_b32 s37, v23, 4
+; GFX10_3-NEXT: v_readlane_b32 s36, v23, 3
+; GFX10_3-NEXT: v_readlane_b32 s35, v23, 2
+; GFX10_3-NEXT: v_readlane_b32 s34, v23, 1
+; GFX10_3-NEXT: v_readlane_b32 s33, v23, 0
; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1
; GFX10_3-NEXT: s_add_i32 s5, s32, 0x80880
; GFX10_3-NEXT: buffer_load_dword v23, off, s[0:3], s5 ; 4-byte Folded Reload
@@ -434,30 +434,30 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
; GFX11-NEXT: s_add_i32 s1, s32, 0x4044
; GFX11-NEXT: scratch_store_b32 off, v23, s1 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v23, s30, 0
+; GFX11-NEXT: v_writelane_b32 v23, s33, 0
; GFX11-NEXT: s_add_i32 s0, s32, 64
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: s_and_b32 s0, 0, exec_lo
-; GFX11-NEXT: v_writelane_b32 v23, s31, 1
+; GFX11-NEXT: v_writelane_b32 v23, s34, 1
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; use alloca0 v0
; GFX11-NEXT: ;;#ASMEND
-; GFX11-NEXT: v_writelane_b32 v23, s33, 2
-; GFX11-NEXT: v_writelane_b32 v23, s34, 3
-; GFX11-NEXT: v_writelane_b32 v23, s35, 4
-; GFX11-NEXT: v_writelane_b32 v23, s36, 5
-; GFX11-NEXT: v_writelane_b32 v23, s37, 6
-; GFX11-NEXT: v_writelane_b32 v23, s38, 7
-; GFX11-NEXT: v_writelane_b32 v23, s39, 8
-; GFX11-NEXT: v_writelane_b32 v23, s48, 9
-; GFX11-NEXT: v_writelane_b32 v23, s49, 10
-; GFX11-NEXT: v_writelane_b32 v23, s50, 11
-; GFX11-NEXT: v_writelane_b32 v23, s51, 12
-; GFX11-NEXT: v_writelane_b32 v23, s52, 13
-; GFX11-NEXT: v_writelane_b32 v23, s53, 14
-; GFX11-NEXT: v_writelane_b32 v23, s54, 15
-; GFX11-NEXT: v_writelane_b32 v23, s55, 16
+; GFX11-NEXT: v_writelane_b32 v23, s35, 2
+; GFX11-NEXT: v_writelane_b32 v23, s36, 3
+; GFX11-NEXT: v_writelane_b32 v23, s37, 4
+; GFX11-NEXT: v_writelane_b32 v23, s38, 5
+; GFX11-NEXT: v_writelane_b32 v23, s39, 6
+; GFX11-NEXT: v_writelane_b32 v23, s48, 7
+; GFX11-NEXT: v_writelane_b32 v23, s49, 8
+; GFX11-NEXT: v_writelane_b32 v23, s50, 9
+; GFX11-NEXT: v_writelane_b32 v23, s51, 10
+; GFX11-NEXT: v_writelane_b32 v23, s52, 11
+; GFX11-NEXT: v_writelane_b32 v23, s53, 12
+; GFX11-NEXT: v_writelane_b32 v23, s54, 13
+; GFX11-NEXT: v_writelane_b32 v23, s55, 14
+; GFX11-NEXT: v_writelane_b32 v23, s30, 15
+; GFX11-NEXT: v_writelane_b32 v23, s31, 16
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc
; GFX11-NEXT: ;;#ASMEND
@@ -470,23 +470,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s54, scc
; GFX11-NEXT: ;;#ASMEND
-; GFX11-NEXT: v_readlane_b32 s55, v23, 16
-; GFX11-NEXT: v_readlane_b32 s54, v23, 15
-; GFX11-NEXT: v_readlane_b32 s53, v23, 14
-; GFX11-NEXT: v_readlane_b32 s52, v23, 13
-; GFX11-NEXT: v_readlane_b32 s51, v23, 12
-; GFX11-NEXT: v_readlane_b32 s50, v23, 11
-; GFX11-NEXT: v_readlane_b32 s49, v23, 10
-; GFX11-NEXT: v_readlane_b32 s48, v23, 9
-; GFX11-NEXT: v_readlane_b32 s39, v23, 8
-; GFX11-NEXT: v_readlane_b32 s38, v23, 7
-; GFX11-NEXT: v_readlane_b32 s37, v23, 6
-; GFX11-NEXT: v_readlane_b32 s36, v23, 5
-; GFX11-NEXT: v_readlane_b32 s35, v23, 4
-; GFX11-NEXT: v_readlane_b32 s34, v23, 3
-; GFX11-NEXT: v_readlane_b32 s33, v23, 2
-; GFX11-NEXT: v_readlane_b32 s31, v23, 1
-; GFX11-NEXT: v_readlane_b32 s30, v23, 0
+; GFX11-NEXT: v_readlane_b32 s30, v23, 15
+; GFX11-NEXT: v_readlane_b32 s31, v23, 16
+; GFX11-NEXT: v_readlane_b32 s55, v23, 14
+; GFX11-NEXT: v_readlane_b32 s54, v23, 13
+; GFX11-NEXT: v_readlane_b32 s53, v23, 12
+; GFX11-NEXT: v_readlane_b32 s52, v23, 11
+; GFX11-NEXT: v_readlane_b32 s51, v23, 10
+; GFX11-NEXT: v_readlane_b32 s50, v23, 9
+; GFX11-NEXT: v_readlane_b32 s49, v23, 8
+; GFX11-NEXT: v_readlane_b32 s48, v23, 7
+; GFX11-NEXT: v_readlane_b32 s39, v23, 6
+; GFX11-NEXT: v_readlane_b32 s38, v23, 5
+; GFX11-NEXT: v_readlane_b32 s37, v23, 4
+; GFX11-NEXT: v_readlane_b32 s36, v23, 3
+; GFX11-NEXT: v_readlane_b32 s35, v23, 2
+; GFX11-NEXT: v_readlane_b32 s34, v23, 1
+; GFX11-NEXT: v_readlane_b32 s33, v23, 0
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
; GFX11-NEXT: s_add_i32 s1, s32, 0x4044
; GFX11-NEXT: scratch_load_b32 v23, off, s1 ; 4-byte Folded Reload
@@ -505,28 +505,28 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
; GFX12-NEXT: scratch_store_b32 off, v23, s32 offset:16388 ; 4-byte Folded Spill
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 exec_lo, s0
-; GFX12-NEXT: v_writelane_b32 v23, s30, 0
+; GFX12-NEXT: v_writelane_b32 v23, s33, 0
; GFX12-NEXT: v_mov_b32_e32 v0, s32
; GFX12-NEXT: s_and_b32 s0, 0, exec_lo
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ; use alloca0 v0
; GFX12-NEXT: ;;#ASMEND
-; GFX12-NEXT: v_writelane_b32 v23, s31, 1
-; GFX12-NEXT: v_writelane_b32 v23, s33, 2
-; GFX12-NEXT: v_writelane_b32 v23, s34, 3
-; GFX12-NEXT: v_writelane_b32 v23, s35, 4
-; GFX12-NEXT: v_writelane_b32 v23, s36, 5
-; GFX12-NEXT: v_writelane_b32 v23, s37, 6
-; GFX12-NEXT: v_writelane_b32 v23, s38, 7
-; GFX12-NEXT: v_writelane_b32 v23, s39, 8
-; GFX12-NEXT: v_writelane_b32 v23, s48, 9
-; GFX12-NEXT: v_writelane_b32 v23, s49, 10
-; GFX12-NEXT: v_writelane_b32 v23, s50, 11
-; GFX12-NEXT: v_writelane_b32 v23, s51, 12
-; GFX12-NEXT: v_writelane_b32 v23, s52, 13
-; GFX12-NEXT: v_writelane_b32 v23, s53, 14
-; GFX12-NEXT: v_writelane_b32 v23, s54, 15
-; GFX12-NEXT: v_writelane_b32 v23, s55, 16
+; GFX12-NEXT: v_writelane_b32 v23, s34, 1
+; GFX12-NEXT: v_writelane_b32 v23, s35, 2
+; GFX12-NEXT: v_writelane_b32 v23, s36, 3
+; GFX12-NEXT: v_writelane_b32 v23, s37, 4
+; GFX12-NEXT: v_writelane_b32 v23, s38, 5
+; GFX12-NEXT: v_writelane_b32 v23, s39, 6
+; GFX12-NEXT: v_writelane_b32 v23, s48, 7
+; GFX12-NEXT: v_writelane_b32 v23, s49, 8
+; GFX12-NEXT: v_writelane_b32 v23, s50, 9
+; GFX12-NEXT: v_writelane_b32 v23, s51, 10
+; GFX12-NEXT: v_writelane_b32 v23, s52, 11
+; GFX12-NEXT: v_writelane_b32 v23, s53, 12
+; GFX12-NEXT: v_writelane_b32 v23, s54, 13
+; GFX12-NEXT: v_writelane_b32 v23, s55, 14
+; GFX12-NEXT: v_writelane_b32 v23, s30, 15
+; GFX12-NEXT: v_writelane_b32 v23, s31, 16
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc
; GFX12-NEXT: ;;#ASMEND
@@ -540,23 +540,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s54, scc
; GFX12-NEXT: ;;#ASMEND
-; GFX12-NEXT: v_readlane_b32 s55, v23, 16
-; GFX12-NEXT: v_readlane_b32 s54, v23, 15
-; GFX12-NEXT: v_readlane_b32 s53, v23, 14
-; GFX12-NEXT: v_readlane_b32 s52, v23, 13
-; GFX12-NEXT: v_readlane_b32 s51, v23, 12
-; GFX12-NEXT: v_readlane_b32 s50, v23, 11
-; GFX12-NEXT: v_readlane_b32 s49, v23, 10
-; GFX12-NEXT: v_readlane_b32 s48, v23, 9
-; GFX12-NEXT: v_readlane_b32 s39, v23, 8
-; GFX12-NEXT: v_readlane_b32 s38, v23, 7
-; GFX12-NEXT: v_readlane_b32 s37, v23, 6
-; GFX12-NEXT: v_readlane_b32 s36, v23, 5
-; GFX12-NEXT: v_readlane_b32 s35, v23, 4
-; GFX12-NEXT: v_readlane_b32 s34, v23, 3
-; GFX12-NEXT: v_readlane_b32 s33, v23, 2
-; GFX12-NEXT: v_readlane_b32 s31, v23, 1
-; GFX12-NEXT: v_readlane_b32 s30, v23, 0
+; GFX12-NEXT: v_readlane_b32 s30, v23, 15
+; GFX12-NEXT: v_readlane_b32 s31, v23, 16
+; GFX12-NEXT: v_readlane_b32 s55, v23, 14
+; GFX12-NEXT: v_readlane_b32 s54, v23, 13
+; GFX12-NEXT: v_readlane_b32 s53, v23, 12
+; GFX12-NEXT: v_readlane_b32 s52, v23, 11
+; GFX12-NEXT: v_readlane_b32 s51, v23, 10
+; GFX12-NEXT: v_readlane_b32 s50, v23, 9
+; GFX12-NEXT: v_readlane_b32 s49, v23, 8
+; GFX12-NEXT: v_readlane_b32 s48, v23, 7
+; GFX12-NEXT: v_readlane_b32 s39, v23, 6
+; GFX12-NEXT: v_readlane_b32 s38, v23, 5
+; GFX12-NEXT: v_readlane_b32 s37, v23, 4
+; GFX12-NEXT: v_readlane_b32 s36, v23, 3
+; GFX12-NEXT: v_readlane_b32 s35, v23, 2
+; GFX12-NEXT: v_readlane_b32 s34, v23, 1
+; GFX12-NEXT: v_readlane_b32 s33, v23, 0
; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
; GFX12-NEXT: scratch_load_b32 v23, off, s32 offset:16388 ; 4-byte Folded Reload
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -613,24 +613,24 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
; GFX7-NEXT: s_add_i32 s6, s32, 0x100400
; GFX7-NEXT: buffer_store_dword v21, off, s[0:3], s6 ; 4-byte Folded Spill
; GFX7-NEXT: s_mov_b64 exec, s[4:5]
-; GFX7-NEXT: v_writelane_b32 v21, s30, 0
-; GFX7-NEXT: v_writelane_b32 v21, s31, 1
-; GFX7-NEXT: v_writelane_b32 v21, s33, 2
-; GFX7-NEXT: v_writelane_b32 v21, s34, 3
-; GFX7-NEXT: v_writelane_b32 v21, s35, 4
-; GFX7-NEXT: v_writelane_b32 v21, s36, 5
-; GFX7-NEXT: v_writelane_b32 v21, s37, 6
-; GFX7-NEXT: v_writelane_b32 v21, s38, 7
-; GFX7-NEXT: v_writelane_b32 v21, s39, 8
-; GFX7-NEXT: v_writelane_b32 v21, s48, 9
-; GFX7-NEXT: v_writelane_b32 v21, s49, 10
-; GFX7-NEXT: v_writelane_b32 v21, s50, 11
-; GFX7-NEXT: v_writelane_b32 v21, s51, 12
-; GFX7-NEXT: v_writelane_b32 v21, s52, 13
-; GFX7-NEXT: v_writelane_b32 v21, s53, 14
-; GFX7-NEXT: v_writelane_b32 v21, s54, 15
+; GFX7-NEXT: v_writelane_b32 v21, s33, 0
+; GFX7-NEXT: v_writelane_b32 v21, s34, 1
+; GFX7-NEXT: v_writelane_b32 v21, s35, 2
+; GFX7-NEXT: v_writelane_b32 v21, s36, 3
+; GFX7-NEXT: v_writelane_b32 v21, s37, 4
+; GFX7-NEXT: v_writelane_b32 v21, s38, 5
+; GFX7-NEXT: v_writelane_b32 v21, s39, 6
+; GFX7-NEXT: v_writelane_b32 v21, s48, 7
+; GFX7-NEXT: v_writelane_b32 v21, s49, 8
+; GFX7-NEXT: v_writelane_b32 v21, s50, 9
+; GFX7-NEXT: v_writelane_b32 v21, s51, 10
+; GFX7-NEXT: v_writelane_b32 v21, s52, 11
+; GFX7-NEXT: v_writelane_b32 v21, s53, 12
+; GFX7-NEXT: v_writelane_b32 v21, s54, 13
+; GFX7-NEXT: v_writelane_b32 v21, s55, 14
+; GFX7-NEXT: v_writelane_b32 v21, s30, 15
; GFX7-NEXT: s_and_b64 s[4:5], 0, exec
-; GFX7-NEXT: v_writelane_b32 v21, s55, 16
+; GFX7-NEXT: v_writelane_b32 v21, s31, 16
; GFX7-NEXT: ;;#ASMSTART
; GFX7-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
; GFX7-NEXT: ;;#ASMEND
@@ -640,23 +640,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
; GFX7-NEXT: ;;#ASMSTART
; GFX7-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s54, scc
; GFX7-NEXT: ;;#ASMEND
-; GFX7-NEXT: v_readlane_b32 s55, v21, 16
-; GFX7-NEXT: v_readlane_b32 s54, v21, 15
-; GFX7-NEXT: v_readlane_b32 s53, v21, 14
-; GFX7-NEXT: v_readlane_b32 s52, v21, 13
-; GFX7-NEXT: v_readlane_b32 s51, v21, 12
-; GFX7-NEXT: v_readlane_b32 s50, v21, 11
-; GFX7-NEXT: v_readlane_b32 s49, v21, 10
-; GFX7-NEXT: v_readlane_b32 s48, v21, 9
-; GFX7-NEXT: v_readlane_b32 s39, v21, 8
-; GFX7-NEXT: v_readlane_b32 s38, v21, 7
-; GFX7-NEXT: v_readlane_b32 s37, v21, 6
-; GFX7-NEXT: v_readlane_b32 s36, v21, 5
-; GFX7-NEXT: v_readlane_b32 s35, v21, 4
-; GFX7-NEXT: v_readlane_b32 s34, v21, 3
-; GFX7-NEXT: v_readlane_b32 s33, v21, 2
-; GFX7-NEXT: v_readlane_b32 s31, v21, 1
-; GFX7-NEXT: v_readlane_b32 s30, v21, 0
+; GFX7-NEXT: v_readlane_b32 s30, v21, 15
+; GFX7-NEXT: v_readlane_b32 s31, v21, 16
+; GFX7-NEXT: v_readlane_b32 s55, v21, 14
+; GFX7-NEXT: v_readlane_b32 s54, v21, 13
+; GFX7-NEXT: v_readlane_b32 s53, v21, 12
+; GFX7-NEXT: v_readlane_b32 s52, v21, 11
+; GFX7-NEXT: v_readlane_b32 s51, v21, 10
+; GFX7-NEXT: v_readlane_b32 s50, v21, 9
+; GFX7-NEXT: v_readlane_b32 s49, v21, 8
+; GFX7-NEXT: v_readlane_b32 s48, v21, 7
+; GFX7-NEXT: v_readlane_b32 s39, v21, 6
+; GFX7-NEXT: v_readlane_b32 s38, v21, 5
+; GFX7-NEXT: v_readlane_b32 s37, v21, 4
+; GFX7-NEXT: v_readlane_b32 s36, v21, 3
+; GFX7-NEXT: v_readlane_b32 s35, v21, 2
+; GFX7-NEXT: v_readlane_b32 s34, v21, 1
+; GFX7-NEXT: v_readlane_b32 s33, v21, 0
; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX7-NEXT: s_add_i32 s6, s32, 0x100400
; GFX7-NEXT: buffer_load_dword v21, off, s[0:3], s6 ; 4-byte Folded Reload
@@ -671,24 +671,24 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
; GFX8-NEXT: s_add_i32 s6, s32, 0x100400
; GFX8-NEXT: buffer_store_dword v21, off, s[0:3], s6 ; 4-byte Folded Spill
; GFX8-NEXT: s_mov_b64 exec, s[4:5]
-; GFX8-NEXT: v_writelane_b32 v21, s30, 0
-; GFX8-NEXT: v_writelane_b32 v21, s31, 1
-; GFX8-NEXT: v_writelane_b32 v21, s33, 2
-; GFX8-NEXT: v_writelane_b32 v21, s34, 3
-; GFX8-NEXT: v_writelane_b32 v21, s35, 4
-; GFX8-NEXT: v_writelane_b32 v21, s36, 5
-; GFX8-NEXT: v_writelane_b32 v21, s37, 6
-; GFX8-NEXT: v_writelane_b32 v21, s38, 7
-; GFX8-NEXT: v_writelane_b32 v21, s39, 8
-; GFX8-NEXT: v_writelane_b32 v21, s48, 9
-; GFX8-NEXT: v_writelane_b32 v21, s49, 10
-; GFX8-NEXT: v_writelane_b32 v21, s50, 11
-; GFX8-NEXT: v_writelane_b32 v21, s51, 12
-; GFX8-NEXT: v_writelane_b32 v21, s52, 13
-; GFX8-NEXT: v_writelane_b32 v21, s53, 14
-; GFX8-NEXT: v_writelane_b32 v21, s54, 15
+; GFX8-NEXT: v_writelane_b32 v21, s33, 0
+; GFX8-NEXT: v_writelane_b32 v21, s34, 1
+; GFX8-NEXT: v_writelane_b32 v21, s35, 2
+; GFX8-NEXT: v_writelane_b32 v21, s36, 3
+; GFX8-NEXT: v_writelane_b32 v21, s37, 4
+; GFX8-NEXT: v_writelane_b32 v21, s38, 5
+; GFX8-NEXT: v_writelane_b32 v21, s39, 6
+; GFX8-NEXT: v_writelane_b32 v21, s48, 7
+; GFX8-NEXT: v_writelane_b32 v21, s49, 8
+; GFX8-NEXT: v_writelane_b32 v21, s50, 9
+; GFX8-NEXT: v_writelane_b32 v21, s51, 10
+; GFX8-NEXT: v_writelane_b32 v21, s52, 11
+; GFX8-NEXT: v_writelane_b32 v21, s53, 12
+; GFX8-NEXT: v_writelane_b32 v21, s54, 13
+; GFX8-NEXT: v_writelane_b32 v21, s55, 14
+; GFX8-NEXT: v_writelane_b32 v21, s30, 15
; GFX8-NEXT: s_and_b64 s[4:5], 0, exec
-; GFX8-NEXT: v_writelane_b32 v21, s55, 16
+; GFX8-NEXT: v_writelane_b32 v21, s31, 16
; GFX8-NEXT: ;;#ASMSTART
; GFX8-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
; GFX8-NEXT: ;;#ASMEND
@@ -699,23 +699,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
; GFX8-NEXT: ;;#ASMSTART
; GFX8-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s54, scc
; GFX8-NEXT: ;;#ASMEND
-; GFX8-NEXT: v_readlane_b32 s55, v21, 16
-; GFX8-NEXT: v_readlane_b32 s54, v21, 15
-; GFX8-NEXT: v_readlane_b32 s53, v21, 14
-; GFX8-NEXT: v_readlane_b32 s52, v21, 13
-; GFX8-NEXT: v_readlane_b32 s51, v21, 12
-; GFX8-NEXT: v_readlane_b32 s50, v21, 11
-; GFX8-NEXT: v_readlane_b32 s49, v21, 10
-; GFX8-NEXT: v_readlane_b32 s48, v21, 9
-; GFX8-NEXT: v_readlane_b32 s39, v21, 8
-; GFX8-NEXT: v_readlane_b32 s38, v21, 7
-; GFX8-NEXT: v_readlane_b32 s37, v21, 6
-; GFX8-NEXT: v_readlane_b32 s36, v21, 5
-; GFX8-NEXT: v_readlane_b32 s35, v21, 4
-; GFX8-NEXT: v_readlane_b32 s34, v21, 3
-; GFX8-NEXT: v_readlane_b32 s33, v21, 2
-; GFX8-NEXT: v_readlane_b32 s31, v21, 1
-; GFX8-NEXT: v_readlane_b32 s30, v21, 0
+; GFX8-NEXT: v_readlane_b32 s30, v21, 15
+; GFX8-NEXT: v_readlane_b32 s31, v21, 16
+; GFX8-NEXT: v_readlane_b32 s55, v21, 14
+; GFX8-NEXT: v_readlane_b32 s54, v21, 13
+; GFX8-NEXT: v_readlane_b32 s53, v21, 12
+; GFX8-NEXT: v_readlane_b32 s52, v21, 11
+; GFX8-NEXT: v_readlane_b32 s51, v21, 10
+; GFX8-NEXT: v_readlane_b32 s50, v21, 9
+; GFX8-NEXT: v_readlane_b32 s49, v21, 8
+; GFX8-NEXT: v_readlane_b32 s48, v21, 7
+; GFX8-NEXT: v_readlane_b32 s39, v21, 6
+; GFX8-NEXT: v_readlane_b32 s38, v21, 5
+; GFX8-NEXT: v_readlane_b32 s37, v21, 4
+; GFX8-NEXT: v_readlane_b32 s36, v21, 3
+; GFX8-NEXT: v_readlane_b32 s35, v21, 2
+; GFX8-NEXT: v_readlane_b32 s34, v21, 1
+; GFX8-NEXT: v_readlane_b32 s33, v21, 0
; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX8-NEXT: s_add_i32 s6, s32, 0x100400
; GFX8-NEXT: buffer_load_dword v21, off, s[0:3], s6 ; 4-byte Folded Reload
@@ -730,24 +730,24 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
; GFX900-NEXT: s_add_i32 s6, s32, 0x100400
; GFX900-NEXT: buffer_store_dword v21, off, s[0:3], s6 ; 4-byte Folded Spill
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
-; GFX900-NEXT: v_writelane_b32 v21, s30, 0
-; GFX900-NEXT: v_writelane_b32 v21, s31, 1
-; GFX900-NEXT: v_writelane_b32 v21, s33, 2
-; GFX900-NEXT: v_writelane_b32 v21, s34, 3
-; GFX900-NEXT: v_writelane_b32 v21, s35, 4
-; GFX900-NEXT: v_writelane_b32 v21, s36, 5
-; GFX900-NEXT: v_writelane_b32 v21, s37, 6
-; GFX900-NEXT: v_writelane_b32 v21, s38, 7
-; GFX900-NEXT: v_writelane_b32 v21, s39, 8
-; GFX900-NEXT: v_writelane_b32 v21, s48, 9
-; GFX900-NEXT: v_writelane_b32 v21, s49, 10
-; GFX900-NEXT: v_writelane_b32 v21, s50, 11
-; GFX900-NEXT: v_writelane_b32 v21, s51, 12
-; GFX900-NEXT: v_writelane_b32 v21, s52, 13
-; GFX900-NEXT: v_writelane_b32 v21, s53, 14
-; GFX900-NEXT: v_writelane_b32 v21, s54, 15
+; GFX900-NEXT: v_writelane_b32 v21, s33, 0
+; GFX900-NEXT: v_writelane_b32 v21, s34, 1
+; GFX900-NEXT: v_writelane_b32 v21, s35, 2
+; GFX900-NEXT: v_writelane_b32 v21, s36, 3
+; GFX900-NEXT: v_writelane_b32 v21, s37, 4
+; GFX900-NEXT: v_writelane_b32 v21, s38, 5
+; GFX900-NEXT: v_writelane_b32 v21, s39, 6
+; GFX900-NEXT: v_writelane_b32 v21, s48, 7
+; GFX900-NEXT: v_writelane_b32 v21, s49, 8
+; GFX900-NEXT: v_writelane_b32 v21, s50, 9
+; GFX900-NEXT: v_writelane_b32 v21, s51, 10
+; GFX900-NEXT: v_writelane_b32 v21, s52, 11
+; GFX900-NEXT: v_writelane_b32 v21, s53, 12
+; GFX900-NEXT: v_writelane_b32 v21, s54, 13
+; GFX900-NEXT: v_writelane_b32 v21, s55, 14
+; GFX900-NEXT: v_writelane_b32 v21, s30, 15
; GFX900-NEXT: s_and_b64 s[4:5], 0, exec
-; GFX900-NEXT: v_writelane_b32 v21, s55, 16
+; GFX900-NEXT: v_writelane_b32 v21, s31, 16
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
; GFX900-NEXT: ;;#ASMEND
@@ -758,23 +758,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s54, scc
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_readlane_b32 s55, v21, 16
-; GFX900-NEXT: v_readlane_b32 s54, v21, 15
-; GFX900-NEXT: v_readlane_b32 s53, v21, 14
-; GFX900-NEXT: v_readlane_b32 s52, v21, 13
-; GFX900-NEXT: v_readlane_b32 s51, v21, 12
-; GFX900-NEXT: v_readlane_b32 s50, v21, 11
-; GFX900-NEXT: v_readlane_b32 s49, v21, 10
-; GFX900-NEXT: v_readlane_b32 s48, v21, 9
-; GFX900-NEXT: v_readlane_b32 s39, v21, 8
-; GFX900-NEXT: v_readlane_b32 s38, v21, 7
-; GFX900-NEXT: v_readlane_b32 s37, v21, 6
-; GFX900-NEXT: v_readlane_b32 s36, v21, 5
-; GFX900-NEXT: v_readlane_b32 s35, v21, 4
-; GFX900-NEXT: v_readlane_b32 s34, v21, 3
-; GFX900-NEXT: v_readlane_b32 s33, v21, 2
-; GFX900-NEXT: v_readlane_b32 s31, v21, 1
-; GFX900-NEXT: v_readlane_b32 s30, v21, 0
+; GFX900-NEXT: v_readlane_b32 s30, v21, 15
+; GFX900-NEXT: v_readlane_b32 s31, v21, 16
+; GFX900-NEXT: v_readlane_b32 s55, v21, 14
+; GFX900-NEXT: v_readlane_b32 s54, v21, 13
+; GFX900-NEXT: v_readlane_b32 s53, v21, 12
+; GFX900-NEXT: v_readlane_b32 s52, v21, 11
+; GFX900-NEXT: v_readlane_b32 s51, v21, 10
+; GFX900-NEXT: v_readlane_b32 s50, v21, 9
+; GFX900-NEXT: v_readlane_b32 s49, v21, 8
+; GFX900-NEXT: v_readlane_b32 s48, v21, 7
+; GFX900-NEXT: v_readlane_b32 s39, v21, 6
+; GFX900-NEXT: v_readlane_b32 s38, v21, 5
+; GFX900-NEXT: v_readlane_b32 s37, v21, 4
+; GFX900-NEXT: v_readlane_b32 s36, v21, 3
+; GFX900-NEXT: v_readlane_b32 s35, v21, 2
+; GFX900-NEXT: v_readlane_b32 s34, v21, 1
+; GFX900-NEXT: v_readlane_b32 s33, v21, 0
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: s_add_i32 s6, s32, 0x100400
; GFX900-NEXT: buffer_load_dword v21, off, s[0:3], s6 ; 4-byte Folded Reload
@@ -789,24 +789,25 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
; GFX942-NEXT: s_add_i32 s2, s32, 0x4010
; GFX942-NEXT: scratch_store_dword off, v21, s2 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
-; GFX942-NEXT: v_writelane_b32 v21, s30, 0
-; GFX942-NEXT: v_writelane_b32 v21, s31, 1
-; GFX942-NEXT: v_writelane_b32 v21, s33, 2
-; GFX942-NEXT: v_writelane_b32 v21, s34, 3
-; GFX942-NEXT: v_writelane_b32 v21, s35, 4
-; GFX942-NEXT: v_writelane_b32 v21, s36, 5
-; GFX942-NEXT: v_writelane_b32 v21, s37, 6
-; GFX942-NEXT: v_writelane_b32 v21, s38, 7
-; GFX942-NEXT: v_writelane_b32 v21, s39, 8
-; GFX942-NEXT: v_writelane_b32 v21, s48, 9
-; GFX942-NEXT: v_writelane_b32 v21, s49, 10
-; GFX942-NEXT: v_writelane_b32 v21, s50, 11
-; GFX942-NEXT: v_writelane_b32 v21, s51, 12
-; GFX942-NEXT: v_writelane_b32 v21, s52, 13
-; GFX942-NEXT: v_writelane_b32 v21, s53, 14
-; GFX942-NEXT: v_writelane_b32 v21, s54, 15
+; GFX942-NEXT: v_writelane_b32 v21, s33, 0
+; GFX942-NEXT: v_writelane_b32 v21, s34, 1
+; GFX942-NEXT: v_writelane_b32 v21, s35, 2
+; GFX942-NEXT: v_writelane_b32 v21, s36, 3
+; GFX942-NEXT: v_writelane_b32 v21, s37, 4
+; GFX942-NEXT: v_writelane_b32 v21, s38, 5
+; GFX942-NEXT: v_writelane_b32 v21, s39, 6
+; GFX942-NEXT: v_writelane_b32 v21, s48, 7
+; GFX942-NEXT: v_writelane_b32 v21, s49, 8
+; GFX942-NEXT: v_writelane_b32 v21, s50, 9
+; GFX942-NEXT: v_writelane_b32 v21, s51, 10
+; GFX942-NEXT: v_writelane_b32 v21, s52, 11
+; GFX942-NEXT: v_writelane_b32 v21, s53, 12
+; GFX942-NEXT: v_writelane_b32 v21, s54, 13
+; GFX942-NEXT: v_writelane_b32 v21, s55, 14
+; GFX942-NEXT: v_writelane_b32 v21, s30, 15
; GFX942-NEXT: s_and_b64 s[60:61], 0, exec
-; GFX942-NEXT: v_writelane_b32 v21, s55, 16
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_writelane_b32 v21, s31, 16
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
; GFX942-NEXT: ;;#ASMEND
@@ -818,23 +819,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s54, scc
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_readlane_b32 s55, v21, 16
-; GFX942-NEXT: v_readlane_b32 s54, v21, 15
-; GFX942-NEXT: v_readlane_b32 s53, v21, 14
-; GFX942-NEXT: v_readlane_b32 s52, v21, 13
-; GFX942-NEXT: v_readlane_b32 s51, v21, 12
-; GFX942-NEXT: v_readlane_b32 s50, v21, 11
-; GFX942-NEXT: v_readlane_b32 s49, v21, 10
-; GFX942-NEXT: v_readlane_b32 s48, v21, 9
-; GFX942-NEXT: v_readlane_b32 s39, v21, 8
-; GFX942-NEXT: v_readlane_b32 s38, v21, 7
-; GFX942-NEXT: v_readlane_b32 s37, v21, 6
-; GFX942-NEXT: v_readlane_b32 s36, v21, 5
-; GFX942-NEXT: v_readlane_b32 s35, v21, 4
-; GFX942-NEXT: v_readlane_b32 s34, v21, 3
-; GFX942-NEXT: v_readlane_b32 s33, v21, 2
-; GFX942-NEXT: v_readlane_b32 s31, v21, 1
-; GFX942-NEXT: v_readlane_b32 s30, v21, 0
+; GFX942-NEXT: v_readlane_b32 s30, v21, 15
+; GFX942-NEXT: v_readlane_b32 s31, v21, 16
+; GFX942-NEXT: v_readlane_b32 s55, v21, 14
+; GFX942-NEXT: v_readlane_b32 s54, v21, 13
+; GFX942-NEXT: v_readlane_b32 s53, v21, 12
+; GFX942-NEXT: v_readlane_b32 s52, v21, 11
+; GFX942-NEXT: v_readlane_b32 s51, v21, 10
+; GFX942-NEXT: v_readlane_b32 s50, v21, 9
+; GFX942-NEXT: v_readlane_b32 s49, v21, 8
+; GFX942-NEXT: v_readlane_b32 s48, v21, 7
+; GFX942-NEXT: v_readlane_b32 s39, v21, 6
+; GFX942-NEXT: v_readlane_b32 s38, v21, 5
+; GFX942-NEXT: v_readlane_b32 s37, v21, 4
+; GFX942-NEXT: v_readlane_b32 s36, v21, 3
+; GFX942-NEXT: v_readlane_b32 s35, v21, 2
+; GFX942-NEXT: v_readlane_b32 s34, v21, 1
+; GFX942-NEXT: v_readlane_b32 s33, v21, 0
; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX942-NEXT: s_add_i32 s2, s32, 0x4010
; GFX942-NEXT: scratch_load_dword v21, off, s2 ; 4-byte Folded Reload
@@ -850,24 +851,24 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
; GFX10_1-NEXT: buffer_store_dword v21, off, s[0:3], s5 ; 4-byte Folded Spill
; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3
; GFX10_1-NEXT: s_mov_b32 exec_lo, s4
-; GFX10_1-NEXT: v_writelane_b32 v21, s30, 0
+; GFX10_1-NEXT: v_writelane_b32 v21, s33, 0
; GFX10_1-NEXT: s_and_b32 s59, 0, exec_lo
-; GFX10_1-NEXT: v_writelane_b32 v21, s31, 1
-; GFX10_1-NEXT: v_writelane_b32 v21, s33, 2
-; GFX10_1-NEXT: v_writelane_b32 v21, s34, 3
-; GFX10_1-NEXT: v_writelane_b32 v21, s35, 4
-; GFX10_1-NEXT: v_writelane_b32 v21, s36, 5
-; GFX10_1-NEXT: v_writelane_b32 v21, s37, 6
-; GFX10_1-NEXT: v_writelane_b32 v21, s38, 7
-; GFX10_1-NEXT: v_writelane_b32 v21, s39, 8
-; GFX10_1-NEXT: v_writelane_b32 v21, s48, 9
-; GFX10_1-NEXT: v_writelane_b32 v21, s49, 10
-; GFX10_1-NEXT: v_writelane_b32 v21, s50, 11
-; GFX10_1-NEXT: v_writelane_b32 v21, s51, 12
-; GFX10_1-NEXT: v_writelane_b32 v21, s52, 13
-; GFX10_1-NEXT: v_writelane_b32 v21, s53, 14
-; GFX10_1-NEXT: v_writelane_b32 v21, s54, 15
-; GFX10_1-NEXT: v_writelane_b32 v21, s55, 16
+; GFX10_1-NEXT: v_writelane_b32 v21, s34, 1
+; GFX10_1-NEXT: v_writelane_b32 v21, s35, 2
+; GFX10_1-NEXT: v_writelane_b32 v21, s36, 3
+; GFX10_1-NEXT: v_writelane_b32 v21, s37, 4
+; GFX10_1-NEXT: v_writelane_b32 v21, s38, 5
+; GFX10_1-NEXT: v_writelane_b32 v21, s39, 6
+; GFX10_1-NEXT: v_writelane_b32 v21, s48, 7
+; GFX10_1-NEXT: v_writelane_b32 v21, s49, 8
+; GFX10_1-NEXT: v_writelane_b32 v21, s50, 9
+; GFX10_1-NEXT: v_writelane_b32 v21, s51, 10
+; GFX10_1-NEXT: v_writelane_b32 v21, s52, 11
+; GFX10_1-NEXT: v_writelane_b32 v21, s53, 12
+; GFX10_1-NEXT: v_writelane_b32 v21, s54, 13
+; GFX10_1-NEXT: v_writelane_b32 v21, s55, 14
+; GFX10_1-NEXT: v_writelane_b32 v21, s30, 15
+; GFX10_1-NEXT: v_writelane_b32 v21, s31, 16
; GFX10_1-NEXT: ;;#ASMSTART
; GFX10_1-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
; GFX10_1-NEXT: ;;#ASMEND
@@ -878,23 +879,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
; GFX10_1-NEXT: ;;#ASMSTART
; GFX10_1-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s54, scc
; GFX10_1-NEXT: ;;#ASMEND
-; GFX10_1-NEXT: v_readlane_b32 s55, v21, 16
-; GFX10_1-NEXT: v_readlane_b32 s54, v21, 15
-; GFX10_1-NEXT: v_readlane_b32 s53, v21, 14
-; GFX10_1-NEXT: v_readlane_b32 s52, v21, 13
-; GFX10_1-NEXT: v_readlane_b32 s51, v21, 12
-; GFX10_1-NEXT: v_readlane_b32 s50, v21, 11
-; GFX10_1-NEXT: v_readlane_b32 s49, v21, 10
-; GFX10_1-NEXT: v_readlane_b32 s48, v21, 9
-; GFX10_1-NEXT: v_readlane_b32 s39, v21, 8
-; GFX10_1-NEXT: v_readlane_b32 s38, v21, 7
-; GFX10_1-NEXT: v_readlane_b32 s37, v21, 6
-; GFX10_1-NEXT: v_readlane_b32 s36, v21, 5
-; GFX10_1-NEXT: v_readlane_b32 s35, v21, 4
-; GFX10_1-NEXT: v_readlane_b32 s34, v21, 3
-; GFX10_1-NEXT: v_readlane_b32 s33, v21, 2
-; GFX10_1-NEXT: v_readlane_b32 s31, v21, 1
-; GFX10_1-NEXT: v_readlane_b32 s30, v21, 0
+; GFX10_1-NEXT: v_readlane_b32 s30, v21, 15
+; GFX10_1-NEXT: v_readlane_b32 s31, v21, 16
+; GFX10_1-NEXT: v_readlane_b32 s55, v21, 14
+; GFX10_1-NEXT: v_readlane_b32 s54, v21, 13
+; GFX10_1-NEXT: v_readlane_b32 s53, v21, 12
+; GFX10_1-NEXT: v_readlane_b32 s52, v21, 11
+; GFX10_1-NEXT: v_readlane_b32 s51, v21, 10
+; GFX10_1-NEXT: v_readlane_b32 s50, v21, 9
+; GFX10_1-NEXT: v_readlane_b32 s49, v21, 8
+; GFX10_1-NEXT: v_readlane_b32 s48, v21, 7
+; GFX10_1-NEXT: v_readlane_b32 s39, v21, 6
+; GFX10_1-NEXT: v_readlane_b32 s38, v21, 5
+; GFX10_1-NEXT: v_readlane_b32 s37, v21, 4
+; GFX10_1-NEXT: v_readlane_b32 s36, v21, 3
+; GFX10_1-NEXT: v_readlane_b32 s35, v21, 2
+; GFX10_1-NEXT: v_readlane_b32 s34, v21, 1
+; GFX10_1-NEXT: v_readlane_b32 s33, v21, 0
; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1
; GFX10_1-NEXT: s_add_i32 s5, s32, 0x80200
; GFX10_1-NEXT: buffer_load_dword v21, off, s[0:3], s5 ; 4-byte Folded Reload
@@ -910,24 +911,24 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
; GFX10_3-NEXT: s_add_i32 s5, s32, 0x80200
; GFX10_3-NEXT: buffer_store_dword v21, off, s[0:3], s5 ; 4-byte Folded Spill
; GFX10_3-NEXT: s_mov_b32 exec_lo, s4
-; GFX10_3-NEXT: v_writelane_b32 v21, s30, 0
+; GFX10_3-NEXT: v_writelane_b32 v21, s33, 0
; GFX10_3-NEXT: s_and_b32 s59, 0, exec_lo
-; GFX10_3-NEXT: v_writelane_b32 v21, s31, 1
-; GFX10_3-NEXT: v_writelane_b32 v21, s33, 2
-; GFX10_3-NEXT: v_writelane_b32 v21, s34, 3
-; GFX10_3-NEXT: v_writelane_b32 v21, s35, 4
-; GFX10_3-NEXT: v_writelane_b32 v21, s36, 5
-; GFX10_3-NEXT: v_writelane_b32 v21, s37, 6
-; GFX10_3-NEXT: v_writelane_b32 v21, s38, 7
-; GFX10_3-NEXT: v_writelane_b32 v21, s39, 8
-; GFX10_3-NEXT: v_writelane_b32 v21, s48, 9
-; GFX10_3-NEXT: v_writelane_b32 v21, s49, 10
-; GFX10_3-NEXT: v_writelane_b32 v21, s50, 11
-; GFX10_3-NEXT: v_writelane_b32 v21, s51, 12
-; GFX10_3-NEXT: v_writelane_b32 v21, s52, 13
-; GFX10_3-NEXT: v_writelane_b32 v21, s53, 14
-; GFX10_3-NEXT: v_writelane_b32 v21, s54, 15
-; GFX10_3-NEXT: v_writelane_b32 v21, s55, 16
+; GFX10_3-NEXT: v_writelane_b32 v21, s34, 1
+; GFX10_3-NEXT: v_writelane_b32 v21, s35, 2
+; GFX10_3-NEXT: v_writelane_b32 v21, s36, 3
+; GFX10_3-NEXT: v_writelane_b32 v21, s37, 4
+; GFX10_3-NEXT: v_writelane_b32 v21, s38, 5
+; GFX10_3-NEXT: v_writelane_b32 v21, s39, 6
+; GFX10_3-NEXT: v_writelane_b32 v21, s48, 7
+; GFX10_3-NEXT: v_writelane_b32 v21, s49, 8
+; GFX10_3-NEXT: v_writelane_b32 v21, s50, 9
+; GFX10_3-NEXT: v_writelane_b32 v21, s51, 10
+; GFX10_3-NEXT: v_writelane_b32 v21, s52, 11
+; GFX10_3-NEXT: v_writelane_b32 v21, s53, 12
+; GFX10_3-NEXT: v_writelane_b32 v21, s54, 13
+; GFX10_3-NEXT: v_writelane_b32 v21, s55, 14
+; GFX10_3-NEXT: v_writelane_b32 v21, s30, 15
+; GFX10_3-NEXT: v_writelane_b32 v21, s31, 16
; GFX10_3-NEXT: ;;#ASMSTART
; GFX10_3-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
; GFX10_3-NEXT: ;;#ASMEND
@@ -938,23 +939,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
; GFX10_3-NEXT: ;;#ASMSTART
; GFX10_3-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s54, scc
; GFX10_3-NEXT: ;;#ASMEND
-; GFX10_3-NEXT: v_readlane_b32 s55, v21, 16
-; GFX10_3-NEXT: v_readlane_b32 s54, v21, 15
-; GFX10_3-NEXT: v_readlane_b32 s53, v21, 14
-; GFX10_3-NEXT: v_readlane_b32 s52, v21, 13
-; GFX10_3-NEXT: v_readlane_b32 s51, v21, 12
-; GFX10_3-NEXT: v_readlane_b32 s50, v21, 11
-; GFX10_3-NEXT: v_readlane_b32 s49, v21, 10
-; GFX10_3-NEXT: v_readlane_b32 s48, v21, 9
-; GFX10_3-NEXT: v_readlane_b32 s39, v21, 8
-; GFX10_3-NEXT: v_readlane_b32 s38, v21, 7
-; GFX10_3-NEXT: v_readlane_b32 s37, v21, 6
-; GFX10_3-NEXT: v_readlane_b32 s36, v21, 5
-; GFX10_3-NEXT: v_readlane_b32 s35, v21, 4
-; GFX10_3-NEXT: v_readlane_b32 s34, v21, 3
-; GFX10_3-NEXT: v_readlane_b32 s33, v21, 2
-; GFX10_3-NEXT: v_readlane_b32 s31, v21, 1
-; GFX10_3-NEXT: v_readlane_b32 s30, v21, 0
+; GFX10_3-NEXT: v_readlane_b32 s30, v21, 15
+; GFX10_3-NEXT: v_readlane_b32 s31, v21, 16
+; GFX10_3-NEXT: v_readlane_b32 s55, v21, 14
+; GFX10_3-NEXT: v_readlane_b32 s54, v21, 13
+; GFX10_3-NEXT: v_readlane_b32 s53, v21, 12
+; GFX10_3-NEXT: v_readlane_b32 s52, v21, 11
+; GFX10_3-NEXT: v_readlane_b32 s51, v21, 10
+; GFX10_3-NEXT: v_readlane_b32 s50, v21, 9
+; GFX10_3-NEXT: v_readlane_b32 s49, v21, 8
+; GFX10_3-NEXT: v_readlane_b32 s48, v21, 7
+; GFX10_3-NEXT: v_readlane_b32 s39, v21, 6
+; GFX10_3-NEXT: v_readlane_b32 s38, v21, 5
+; GFX10_3-NEXT: v_readlane_b32 s37, v21, 4
+; GFX10_3-NEXT: v_readlane_b32 s36, v21, 3
+; GFX10_3-NEXT: v_readlane_b32 s35, v21, 2
+; GFX10_3-NEXT: v_readlane_b32 s34, v21, 1
+; GFX10_3-NEXT: v_readlane_b32 s33, v21, 0
; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1
; GFX10_3-NEXT: s_add_i32 s5, s32, 0x80200
; GFX10_3-NEXT: buffer_load_dword v21, off, s[0:3], s5 ; 4-byte Folded Reload
@@ -969,24 +970,24 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
; GFX11-NEXT: s_add_i32 s1, s32, 0x4010
; GFX11-NEXT: scratch_store_b32 off, v21, s1 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v21, s30, 0
+; GFX11-NEXT: v_writelane_b32 v21, s33, 0
; GFX11-NEXT: s_and_b32 s59, 0, exec_lo
-; GFX11-NEXT: v_writelane_b32 v21, s31, 1
-; GFX11-NEXT: v_writelane_b32 v21, s33, 2
-; GFX11-NEXT: v_writelane_b32 v21, s34, 3
-; GFX11-NEXT: v_writelane_b32 v21, s35, 4
-; GFX11-NEXT: v_writelane_b32 v21, s36, 5
-; GFX11-NEXT: v_writelane_b32 v21, s37, 6
-; GFX11-NEXT: v_writelane_b32 v21, s38, 7
-; GFX11-NEXT: v_writelane_b32 v21, s39, 8
-; GFX11-NEXT: v_writelane_b32 v21, s48, 9
-; GFX11-NEXT: v_writelane_b32 v21, s49, 10
-; GFX11-NEXT: v_writelane_b32 v21, s50, 11
-; GFX11-NEXT: v_writelane_b32 v21, s51, 12
-; GFX11-NEXT: v_writelane_b32 v21, s52, 13
-; GFX11-NEXT: v_writelane_b32 v21, s53, 14
-; GFX11-NEXT: v_writelane_b32 v21, s54, 15
-; GFX11-NEXT: v_writelane_b32 v21, s55, 16
+; GFX11-NEXT: v_writelane_b32 v21, s34, 1
+; GFX11-NEXT: v_writelane_b32 v21, s35, 2
+; GFX11-NEXT: v_writelane_b32 v21, s36, 3
+; GFX11-NEXT: v_writelane_b32 v21, s37, 4
+; GFX11-NEXT: v_writelane_b32 v21, s38, 5
+; GFX11-NEXT: v_writelane_b32 v21, s39, 6
+; GFX11-NEXT: v_writelane_b32 v21, s48, 7
+; GFX11-NEXT: v_writelane_b32 v21, s49, 8
+; GFX11-NEXT: v_writelane_b32 v21, s50, 9
+; GFX11-NEXT: v_writelane_b32 v21, s51, 10
+; GFX11-NEXT: v_writelane_b32 v21, s52, 11
+; GFX11-NEXT: v_writelane_b32 v21, s53, 12
+; GFX11-NEXT: v_writelane_b32 v21, s54, 13
+; GFX11-NEXT: v_writelane_b32 v21, s55, 14
+; GFX11-NEXT: v_writelane_b32 v21, s30, 15
+; GFX11-NEXT: v_writelane_b32 v21, s31, 16
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
; GFX11-NEXT: ;;#ASMEND
@@ -999,23 +1000,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s54, scc
; GFX11-NEXT: ;;#ASMEND
-; GFX11-NEXT: v_readlane_b32 s55, v21, 16
-; GFX11-NEXT: v_readlane_b32 s54, v21, 15
-; GFX11-NEXT: v_readlane_b32 s53, v21, 14
-; GFX11-NEXT: v_readlane_b32 s52, v21, 13
-; GFX11-NEXT: v_readlane_b32 s51, v21, 12
-; GFX11-NEXT: v_readlane_b32 s50, v21, 11
-; GFX11-NEXT: v_readlane_b32 s49, v21, 10
-; GFX11-NEXT: v_readlane_b32 s48, v21, 9
-; GFX11-NEXT: v_readlane_b32 s39, v21, 8
-; GFX11-NEXT: v_readlane_b32 s38, v21, 7
-; GFX11-NEXT: v_readlane_b32 s37, v21, 6
-; GFX11-NEXT: v_readlane_b32 s36, v21, 5
-; GFX11-NEXT: v_readlane_b32 s35, v21, 4
-; GFX11-NEXT: v_readlane_b32 s34, v21, 3
-; GFX11-NEXT: v_readlane_b32 s33, v21, 2
-; GFX11-NEXT: v_readlane_b32 s31, v21, 1
-; GFX11-NEXT: v_readlane_b32 s30, v21, 0
+; GFX11-NEXT: v_readlane_b32 s30, v21, 15
+; GFX11-NEXT: v_readlane_b32 s31, v21, 16
+; GFX11-NEXT: v_readlane_b32 s55, v21, 14
+; GFX11-NEXT: v_readlane_b32 s54, v21, 13
+; GFX11-NEXT: v_readlane_b32 s53, v21, 12
+; GFX11-NEXT: v_readlane_b32 s52, v21, 11
+; GFX11-NEXT: v_readlane_b32 s51, v21, 10
+; GFX11-NEXT: v_readlane_b32 s50, v21, 9
+; GFX11-NEXT: v_readlane_b32 s49, v21, 8
+; GFX11-NEXT: v_readlane_b32 s48, v21, 7
+; GFX11-NEXT: v_readlane_b32 s39, v21, 6
+; GFX11-NEXT: v_readlane_b32 s38, v21, 5
+; GFX11-NEXT: v_readlane_b32 s37, v21, 4
+; GFX11-NEXT: v_readlane_b32 s36, v21, 3
+; GFX11-NEXT: v_readlane_b32 s35, v21, 2
+; GFX11-NEXT: v_readlane_b32 s34, v21, 1
+; GFX11-NEXT: v_readlane_b32 s33, v21, 0
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
; GFX11-NEXT: s_add_i32 s1, s32, 0x4010
; GFX11-NEXT: scratch_load_b32 v21, off, s1 ; 4-byte Folded Reload
@@ -1034,24 +1035,24 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
; GFX12-NEXT: scratch_store_b32 off, v21, s32 offset:16384 ; 4-byte Folded Spill
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 exec_lo, s0
-; GFX12-NEXT: v_writelane_b32 v21, s30, 0
+; GFX12-NEXT: v_writelane_b32 v21, s33, 0
; GFX12-NEXT: s_and_b32 s59, 0, exec_lo
-; GFX12-NEXT: v_writelane_b32 v21, s31, 1
-; GFX12-NEXT: v_writelane_b32 v21, s33, 2
-; GFX12-NEXT: v_writelane_b32 v21, s34, 3
-; GFX12-NEXT: v_writelane_b32 v21, s35, 4
-; GFX12-NEXT: v_writelane_b32 v21, s36, 5
-; GFX12-NEXT: v_writelane_b32 v21, s37, 6
-; GFX12-NEXT: v_writelane_b32 v21, s38, 7
-; GFX12-NEXT: v_writelane_b32 v21, s39, 8
-; GFX12-NEXT: v_writelane_b32 v21, s48, 9
-; GFX12-NEXT: v_writelane_b32 v21, s49, 10
-; GFX12-NEXT: v_writelane_b32 v21, s50, 11
-; GFX12-NEXT: v_writelane_b32 v21, s51, 12
-; GFX12-NEXT: v_writelane_b32 v21, s52, 13
-; GFX12-NEXT: v_writelane_b32 v21, s53, 14
-; GFX12-NEXT: v_writelane_b32 v21, s54, 15
-; GFX12-NEXT: v_writelane_b32 v21, s55, 16
+; GFX12-NEXT: v_writelane_b32 v21, s34, 1
+; GFX12-NEXT: v_writelane_b32 v21, s35, 2
+; GFX12-NEXT: v_writelane_b32 v21, s36, 3
+; GFX12-NEXT: v_writelane_b32 v21, s37, 4
+; GFX12-NEXT: v_writelane_b32 v21, s38, 5
+; GFX12-NEXT: v_writelane_b32 v21, s39, 6
+; GFX12-NEXT: v_writelane_b32 v21, s48, 7
+; GFX12-NEXT: v_writelane_b32 v21, s49, 8
+; GFX12-NEXT: v_writelane_b32 v21, s50, 9
+; GFX12-NEXT: v_writelane_b32 v21, s51, 10
+; GFX12-NEXT: v_writelane_b32 v21, s52, 11
+; GFX12-NEXT: v_writelane_b32 v21, s53, 12
+; GFX12-NEXT: v_writelane_b32 v21, s54, 13
+; GFX12-NEXT: v_writelane_b32 v21, s55, 14
+; GFX12-NEXT: v_writelane_b32 v21, s30, 15
+; GFX12-NEXT: v_writelane_b32 v21, s31, 16
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
; GFX12-NEXT: ;;#ASMEND
@@ -1061,23 +1062,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
; GFX12-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s54, scc
; GFX12-NEXT: ;;#ASMEND
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_readlane_b32 s55, v21, 16
-; GFX12-NEXT: v_readlane_b32 s54, v21, 15
-; GFX12-NEXT: v_readlane_b32 s53, v21, 14
-; GFX12-NEXT: v_readlane_b32 s52, v21, 13
-; GFX12-NEXT: v_readlane_b32 s51, v21, 12
-; GFX12-NEXT: v_readlane_b32 s50, v21, 11
-; GFX12-NEXT: v_readlane_b32 s49, v21, 10
-; GFX12-NEXT: v_readlane_b32 s48, v21, 9
-; GFX12-NEXT: v_readlane_b32 s39, v21, 8
-; GFX12-NEXT: v_readlane_b32 s38, v21, 7
-; GFX12-NEXT: v_readlane_b32 s37, v21, 6
-; GFX12-NEXT: v_readlane_b32 s36, v21, 5
-; GFX12-NEXT: v_readlane_b32 s35, v21, 4
-; GFX12-NEXT: v_readlane_b32 s34, v21, 3
-; GFX12-NEXT: v_readlane_b32 s33, v21, 2
-; GFX12-NEXT: v_readlane_b32 s31, v21, 1
-; GFX12-NEXT: v_readlane_b32 s30, v21, 0
+; GFX12-NEXT: v_readlane_b32 s30, v21, 15
+; GFX12-NEXT: v_readlane_b32 s31, v21, 16
+; GFX12-NEXT: v_readlane_b32 s55, v21, 14
+; GFX12-NEXT: v_readlane_b32 s54, v21, 13
+; GFX12-NEXT: v_readlane_b32 s53, v21, 12
+; GFX12-NEXT: v_readlane_b32 s52, v21, 11
+; GFX12-NEXT: v_readlane_b32 s51, v21, 10
+; GFX12-NEXT: v_readlane_b32 s50, v21, 9
+; GFX12-NEXT: v_readlane_b32 s49, v21, 8
+; GFX12-NEXT: v_readlane_b32 s48, v21, 7
+; GFX12-NEXT: v_readlane_b32 s39, v21, 6
+; GFX12-NEXT: v_readlane_b32 s38, v21, 5
+; GFX12-NEXT: v_readlane_b32 s37, v21, 4
+; GFX12-NEXT: v_readlane_b32 s36, v21, 3
+; GFX12-NEXT: v_readlane_b32 s35, v21, 2
+; GFX12-NEXT: v_readlane_b32 s34, v21, 1
+; GFX12-NEXT: v_readlane_b32 s33, v21, 0
; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
; GFX12-NEXT: scratch_load_b32 v21, off, s32 offset:16384 ; 4-byte Folded Reload
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -1135,30 +1136,30 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX7-NEXT: s_mov_b64 exec, s[4:5]
; GFX7-NEXT: v_writelane_b32 v23, s28, 17
; GFX7-NEXT: v_writelane_b32 v23, s29, 18
-; GFX7-NEXT: v_writelane_b32 v23, s30, 0
-; GFX7-NEXT: v_writelane_b32 v23, s31, 1
-; GFX7-NEXT: v_writelane_b32 v23, s33, 2
-; GFX7-NEXT: v_writelane_b32 v23, s34, 3
-; GFX7-NEXT: v_writelane_b32 v23, s35, 4
-; GFX7-NEXT: v_writelane_b32 v23, s36, 5
-; GFX7-NEXT: v_writelane_b32 v23, s37, 6
-; GFX7-NEXT: v_writelane_b32 v23, s38, 7
-; GFX7-NEXT: v_writelane_b32 v23, s39, 8
-; GFX7-NEXT: v_writelane_b32 v23, s48, 9
-; GFX7-NEXT: v_writelane_b32 v23, s49, 10
-; GFX7-NEXT: v_writelane_b32 v23, s50, 11
-; GFX7-NEXT: v_writelane_b32 v23, s51, 12
-; GFX7-NEXT: v_writelane_b32 v23, s52, 13
+; GFX7-NEXT: v_writelane_b32 v23, s33, 0
+; GFX7-NEXT: v_writelane_b32 v23, s34, 1
+; GFX7-NEXT: v_writelane_b32 v23, s35, 2
+; GFX7-NEXT: v_writelane_b32 v23, s36, 3
+; GFX7-NEXT: v_writelane_b32 v23, s37, 4
+; GFX7-NEXT: v_writelane_b32 v23, s38, 5
+; GFX7-NEXT: v_writelane_b32 v23, s39, 6
+; GFX7-NEXT: v_writelane_b32 v23, s48, 7
+; GFX7-NEXT: v_writelane_b32 v23, s49, 8
+; GFX7-NEXT: v_writelane_b32 v23, s50, 9
+; GFX7-NEXT: v_writelane_b32 v23, s51, 10
+; GFX7-NEXT: v_writelane_b32 v23, s52, 11
+; GFX7-NEXT: v_writelane_b32 v23, s53, 12
+; GFX7-NEXT: v_writelane_b32 v23, s54, 13
; GFX7-NEXT: s_lshr_b32 s5, s32, 6
-; GFX7-NEXT: v_writelane_b32 v23, s53, 14
+; GFX7-NEXT: v_writelane_b32 v23, s55, 14
; GFX7-NEXT: v_lshr_b32_e64 v0, s32, 6
; GFX7-NEXT: s_add_i32 s4, s5, 0x4240
; GFX7-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane
-; GFX7-NEXT: v_writelane_b32 v23, s54, 15
+; GFX7-NEXT: v_writelane_b32 v23, s30, 15
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 64, v0
; GFX7-NEXT: v_writelane_b32 v22, s4, 0
; GFX7-NEXT: s_and_b64 s[4:5], 0, exec
-; GFX7-NEXT: v_writelane_b32 v23, s55, 16
+; GFX7-NEXT: v_writelane_b32 v23, s31, 16
; GFX7-NEXT: ;;#ASMSTART
; GFX7-NEXT: ; use alloca0 v0
; GFX7-NEXT: ;;#ASMEND
@@ -1169,23 +1170,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX7-NEXT: ;;#ASMSTART
; GFX7-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s54, scc
; GFX7-NEXT: ;;#ASMEND
-; GFX7-NEXT: v_readlane_b32 s55, v23, 16
-; GFX7-NEXT: v_readlane_b32 s54, v23, 15
-; GFX7-NEXT: v_readlane_b32 s53, v23, 14
-; GFX7-NEXT: v_readlane_b32 s52, v23, 13
-; GFX7-NEXT: v_readlane_b32 s51, v23, 12
-; GFX7-NEXT: v_readlane_b32 s50, v23, 11
-; GFX7-NEXT: v_readlane_b32 s49, v23, 10
-; GFX7-NEXT: v_readlane_b32 s48, v23, 9
-; GFX7-NEXT: v_readlane_b32 s39, v23, 8
-; GFX7-NEXT: v_readlane_b32 s38, v23, 7
-; GFX7-NEXT: v_readlane_b32 s37, v23, 6
-; GFX7-NEXT: v_readlane_b32 s36, v23, 5
-; GFX7-NEXT: v_readlane_b32 s35, v23, 4
-; GFX7-NEXT: v_readlane_b32 s34, v23, 3
-; GFX7-NEXT: v_readlane_b32 s33, v23, 2
-; GFX7-NEXT: v_readlane_b32 s31, v23, 1
-; GFX7-NEXT: v_readlane_b32 s30, v23, 0
+; GFX7-NEXT: v_readlane_b32 s30, v23, 15
+; GFX7-NEXT: v_readlane_b32 s31, v23, 16
+; GFX7-NEXT: v_readlane_b32 s55, v23, 14
+; GFX7-NEXT: v_readlane_b32 s54, v23, 13
+; GFX7-NEXT: v_readlane_b32 s53, v23, 12
+; GFX7-NEXT: v_readlane_b32 s52, v23, 11
+; GFX7-NEXT: v_readlane_b32 s51, v23, 10
+; GFX7-NEXT: v_readlane_b32 s50, v23, 9
+; GFX7-NEXT: v_readlane_b32 s49, v23, 8
+; GFX7-NEXT: v_readlane_b32 s48, v23, 7
+; GFX7-NEXT: v_readlane_b32 s39, v23, 6
+; GFX7-NEXT: v_readlane_b32 s38, v23, 5
+; GFX7-NEXT: v_readlane_b32 s37, v23, 4
+; GFX7-NEXT: v_readlane_b32 s36, v23, 3
+; GFX7-NEXT: v_readlane_b32 s35, v23, 2
+; GFX7-NEXT: v_readlane_b32 s34, v23, 1
+; GFX7-NEXT: v_readlane_b32 s33, v23, 0
; GFX7-NEXT: v_readlane_b32 s28, v23, 17
; GFX7-NEXT: v_readlane_b32 s29, v23, 18
; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
@@ -1206,30 +1207,30 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX8-NEXT: s_add_i32 s6, s32, 0x201100
; GFX8-NEXT: buffer_store_dword v22, off, s[0:3], s6 ; 4-byte Folded Spill
; GFX8-NEXT: s_mov_b64 exec, s[4:5]
-; GFX8-NEXT: v_writelane_b32 v23, s30, 0
-; GFX8-NEXT: v_writelane_b32 v23, s31, 1
-; GFX8-NEXT: v_writelane_b32 v23, s33, 2
-; GFX8-NEXT: v_writelane_b32 v23, s34, 3
-; GFX8-NEXT: v_writelane_b32 v23, s35, 4
-; GFX8-NEXT: v_writelane_b32 v23, s36, 5
-; GFX8-NEXT: v_writelane_b32 v23, s37, 6
-; GFX8-NEXT: v_writelane_b32 v23, s38, 7
-; GFX8-NEXT: v_writelane_b32 v23, s39, 8
-; GFX8-NEXT: v_writelane_b32 v23, s48, 9
-; GFX8-NEXT: v_writelane_b32 v23, s49, 10
-; GFX8-NEXT: v_writelane_b32 v23, s50, 11
-; GFX8-NEXT: v_writelane_b32 v23, s51, 12
-; GFX8-NEXT: v_writelane_b32 v23, s52, 13
+; GFX8-NEXT: v_writelane_b32 v23, s33, 0
+; GFX8-NEXT: v_writelane_b32 v23, s34, 1
+; GFX8-NEXT: v_writelane_b32 v23, s35, 2
+; GFX8-NEXT: v_writelane_b32 v23, s36, 3
+; GFX8-NEXT: v_writelane_b32 v23, s37, 4
+; GFX8-NEXT: v_writelane_b32 v23, s38, 5
+; GFX8-NEXT: v_writelane_b32 v23, s39, 6
+; GFX8-NEXT: v_writelane_b32 v23, s48, 7
+; GFX8-NEXT: v_writelane_b32 v23, s49, 8
+; GFX8-NEXT: v_writelane_b32 v23, s50, 9
+; GFX8-NEXT: v_writelane_b32 v23, s51, 10
+; GFX8-NEXT: v_writelane_b32 v23, s52, 11
+; GFX8-NEXT: v_writelane_b32 v23, s53, 12
+; GFX8-NEXT: v_writelane_b32 v23, s54, 13
; GFX8-NEXT: s_lshr_b32 s5, s32, 6
-; GFX8-NEXT: v_writelane_b32 v23, s53, 14
+; GFX8-NEXT: v_writelane_b32 v23, s55, 14
; GFX8-NEXT: v_lshrrev_b32_e64 v0, 6, s32
; GFX8-NEXT: s_add_i32 s4, s5, 0x4240
; GFX8-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane
-; GFX8-NEXT: v_writelane_b32 v23, s54, 15
+; GFX8-NEXT: v_writelane_b32 v23, s30, 15
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 64, v0
; GFX8-NEXT: v_writelane_b32 v22, s4, 0
; GFX8-NEXT: s_and_b64 s[4:5], 0, exec
-; GFX8-NEXT: v_writelane_b32 v23, s55, 16
+; GFX8-NEXT: v_writelane_b32 v23, s31, 16
; GFX8-NEXT: ;;#ASMSTART
; GFX8-NEXT: ; use alloca0 v0
; GFX8-NEXT: ;;#ASMEND
@@ -1241,23 +1242,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX8-NEXT: ;;#ASMSTART
; GFX8-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s54, scc
; GFX8-NEXT: ;;#ASMEND
-; GFX8-NEXT: v_readlane_b32 s55, v23, 16
-; GFX8-NEXT: v_readlane_b32 s54, v23, 15
-; GFX8-NEXT: v_readlane_b32 s53, v23, 14
-; GFX8-NEXT: v_readlane_b32 s52, v23, 13
-; GFX8-NEXT: v_readlane_b32 s51, v23, 12
-; GFX8-NEXT: v_readlane_b32 s50, v23, 11
-; GFX8-NEXT: v_readlane_b32 s49, v23, 10
-; GFX8-NEXT: v_readlane_b32 s48, v23, 9
-; GFX8-NEXT: v_readlane_b32 s39, v23, 8
-; GFX8-NEXT: v_readlane_b32 s38, v23, 7
-; GFX8-NEXT: v_readlane_b32 s37, v23, 6
-; GFX8-NEXT: v_readlane_b32 s36, v23, 5
-; GFX8-NEXT: v_readlane_b32 s35, v23, 4
-; GFX8-NEXT: v_readlane_b32 s34, v23, 3
-; GFX8-NEXT: v_readlane_b32 s33, v23, 2
-; GFX8-NEXT: v_readlane_b32 s31, v23, 1
-; GFX8-NEXT: v_readlane_b32 s30, v23, 0
+; GFX8-NEXT: v_readlane_b32 s30, v23, 15
+; GFX8-NEXT: v_readlane_b32 s31, v23, 16
+; GFX8-NEXT: v_readlane_b32 s55, v23, 14
+; GFX8-NEXT: v_readlane_b32 s54, v23, 13
+; GFX8-NEXT: v_readlane_b32 s53, v23, 12
+; GFX8-NEXT: v_readlane_b32 s52, v23, 11
+; GFX8-NEXT: v_readlane_b32 s51, v23, 10
+; GFX8-NEXT: v_readlane_b32 s50, v23, 9
+; GFX8-NEXT: v_readlane_b32 s49, v23, 8
+; GFX8-NEXT: v_readlane_b32 s48, v23, 7
+; GFX8-NEXT: v_readlane_b32 s39, v23, 6
+; GFX8-NEXT: v_readlane_b32 s38, v23, 5
+; GFX8-NEXT: v_readlane_b32 s37, v23, 4
+; GFX8-NEXT: v_readlane_b32 s36, v23, 3
+; GFX8-NEXT: v_readlane_b32 s35, v23, 2
+; GFX8-NEXT: v_readlane_b32 s34, v23, 1
+; GFX8-NEXT: v_readlane_b32 s33, v23, 0
; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX8-NEXT: s_add_i32 s6, s32, 0x201000
; GFX8-NEXT: buffer_load_dword v23, off, s[0:3], s6 ; 4-byte Folded Reload
@@ -1276,30 +1277,30 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX900-NEXT: s_add_i32 s6, s32, 0x201100
; GFX900-NEXT: buffer_store_dword v22, off, s[0:3], s6 ; 4-byte Folded Spill
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
-; GFX900-NEXT: v_writelane_b32 v23, s30, 0
-; GFX900-NEXT: v_writelane_b32 v23, s31, 1
-; GFX900-NEXT: v_writelane_b32 v23, s33, 2
-; GFX900-NEXT: v_writelane_b32 v23, s34, 3
-; GFX900-NEXT: v_writelane_b32 v23, s35, 4
-; GFX900-NEXT: v_writelane_b32 v23, s36, 5
-; GFX900-NEXT: v_writelane_b32 v23, s37, 6
-; GFX900-NEXT: v_writelane_b32 v23, s38, 7
-; GFX900-NEXT: v_writelane_b32 v23, s39, 8
-; GFX900-NEXT: v_writelane_b32 v23, s48, 9
-; GFX900-NEXT: v_writelane_b32 v23, s49, 10
-; GFX900-NEXT: v_writelane_b32 v23, s50, 11
-; GFX900-NEXT: v_writelane_b32 v23, s51, 12
-; GFX900-NEXT: v_writelane_b32 v23, s52, 13
+; GFX900-NEXT: v_writelane_b32 v23, s33, 0
+; GFX900-NEXT: v_writelane_b32 v23, s34, 1
+; GFX900-NEXT: v_writelane_b32 v23, s35, 2
+; GFX900-NEXT: v_writelane_b32 v23, s36, 3
+; GFX900-NEXT: v_writelane_b32 v23, s37, 4
+; GFX900-NEXT: v_writelane_b32 v23, s38, 5
+; GFX900-NEXT: v_writelane_b32 v23, s39, 6
+; GFX900-NEXT: v_writelane_b32 v23, s48, 7
+; GFX900-NEXT: v_writelane_b32 v23, s49, 8
+; GFX900-NEXT: v_writelane_b32 v23, s50, 9
+; GFX900-NEXT: v_writelane_b32 v23, s51, 10
+; GFX900-NEXT: v_writelane_b32 v23, s52, 11
+; GFX900-NEXT: v_writelane_b32 v23, s53, 12
+; GFX900-NEXT: v_writelane_b32 v23, s54, 13
; GFX900-NEXT: s_lshr_b32 s5, s32, 6
-; GFX900-NEXT: v_writelane_b32 v23, s53, 14
+; GFX900-NEXT: v_writelane_b32 v23, s55, 14
; GFX900-NEXT: v_lshrrev_b32_e64 v0, 6, s32
; GFX900-NEXT: s_add_i32 s4, s5, 0x4240
; GFX900-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane
-; GFX900-NEXT: v_writelane_b32 v23, s54, 15
+; GFX900-NEXT: v_writelane_b32 v23, s30, 15
; GFX900-NEXT: v_add_u32_e32 v0, 64, v0
; GFX900-NEXT: v_writelane_b32 v22, s4, 0
; GFX900-NEXT: s_and_b64 s[4:5], 0, exec
-; GFX900-NEXT: v_writelane_b32 v23, s55, 16
+; GFX900-NEXT: v_writelane_b32 v23, s31, 16
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use alloca0 v0
; GFX900-NEXT: ;;#ASMEND
@@ -1311,23 +1312,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s54, scc
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_readlane_b32 s55, v23, 16
-; GFX900-NEXT: v_readlane_b32 s54, v23, 15
-; GFX900-NEXT: v_readlane_b32 s53, v23, 14
-; GFX900-NEXT: v_readlane_b32 s52, v23, 13
-; GFX900-NEXT: v_readlane_b32 s51, v23, 12
-; GFX900-NEXT: v_readlane_b32 s50, v23, 11
-; GFX900-NEXT: v_readlane_b32 s49, v23, 10
-; GFX900-NEXT: v_readlane_b32 s48, v23, 9
-; GFX900-NEXT: v_readlane_b32 s39, v23, 8
-; GFX900-NEXT: v_readlane_b32 s38, v23, 7
-; GFX900-NEXT: v_readlane_b32 s37, v23, 6
-; GFX900-NEXT: v_readlane_b32 s36, v23, 5
-; GFX900-NEXT: v_readlane_b32 s35, v23, 4
-; GFX900-NEXT: v_readlane_b32 s34, v23, 3
-; GFX900-NEXT: v_readlane_b32 s33, v23, 2
-; GFX900-NEXT: v_readlane_b32 s31, v23, 1
-; GFX900-NEXT: v_readlane_b32 s30, v23, 0
+; GFX900-NEXT: v_readlane_b32 s30, v23, 15
+; GFX900-NEXT: v_readlane_b32 s31, v23, 16
+; GFX900-NEXT: v_readlane_b32 s55, v23, 14
+; GFX900-NEXT: v_readlane_b32 s54, v23, 13
+; GFX900-NEXT: v_readlane_b32 s53, v23, 12
+; GFX900-NEXT: v_readlane_b32 s52, v23, 11
+; GFX900-NEXT: v_readlane_b32 s51, v23, 10
+; GFX900-NEXT: v_readlane_b32 s50, v23, 9
+; GFX900-NEXT: v_readlane_b32 s49, v23, 8
+; GFX900-NEXT: v_readlane_b32 s48, v23, 7
+; GFX900-NEXT: v_readlane_b32 s39, v23, 6
+; GFX900-NEXT: v_readlane_b32 s38, v23, 5
+; GFX900-NEXT: v_readlane_b32 s37, v23, 4
+; GFX900-NEXT: v_readlane_b32 s36, v23, 3
+; GFX900-NEXT: v_readlane_b32 s35, v23, 2
+; GFX900-NEXT: v_readlane_b32 s34, v23, 1
+; GFX900-NEXT: v_readlane_b32 s33, v23, 0
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: s_add_i32 s6, s32, 0x201000
; GFX900-NEXT: buffer_load_dword v23, off, s[0:3], s6 ; 4-byte Folded Reload
@@ -1344,28 +1345,29 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX942-NEXT: s_add_i32 s2, s32, 0x8040
; GFX942-NEXT: scratch_store_dword off, v22, s2 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
-; GFX942-NEXT: v_writelane_b32 v22, s30, 0
-; GFX942-NEXT: v_writelane_b32 v22, s31, 1
-; GFX942-NEXT: v_writelane_b32 v22, s33, 2
-; GFX942-NEXT: v_writelane_b32 v22, s34, 3
-; GFX942-NEXT: v_writelane_b32 v22, s35, 4
-; GFX942-NEXT: v_writelane_b32 v22, s36, 5
-; GFX942-NEXT: v_writelane_b32 v22, s37, 6
-; GFX942-NEXT: v_writelane_b32 v22, s38, 7
-; GFX942-NEXT: v_writelane_b32 v22, s39, 8
-; GFX942-NEXT: v_writelane_b32 v22, s48, 9
-; GFX942-NEXT: v_writelane_b32 v22, s49, 10
-; GFX942-NEXT: v_writelane_b32 v22, s50, 11
-; GFX942-NEXT: v_writelane_b32 v22, s51, 12
-; GFX942-NEXT: v_writelane_b32 v22, s52, 13
-; GFX942-NEXT: v_writelane_b32 v22, s53, 14
+; GFX942-NEXT: v_writelane_b32 v22, s33, 0
+; GFX942-NEXT: v_writelane_b32 v22, s34, 1
+; GFX942-NEXT: v_writelane_b32 v22, s35, 2
+; GFX942-NEXT: v_writelane_b32 v22, s36, 3
+; GFX942-NEXT: v_writelane_b32 v22, s37, 4
+; GFX942-NEXT: v_writelane_b32 v22, s38, 5
+; GFX942-NEXT: v_writelane_b32 v22, s39, 6
+; GFX942-NEXT: v_writelane_b32 v22, s48, 7
+; GFX942-NEXT: v_writelane_b32 v22, s49, 8
+; GFX942-NEXT: v_writelane_b32 v22, s50, 9
+; GFX942-NEXT: v_writelane_b32 v22, s51, 10
+; GFX942-NEXT: v_writelane_b32 v22, s52, 11
+; GFX942-NEXT: v_writelane_b32 v22, s53, 12
+; GFX942-NEXT: v_writelane_b32 v22, s54, 13
+; GFX942-NEXT: v_writelane_b32 v22, s55, 14
; GFX942-NEXT: s_add_i32 s0, s32, 64
-; GFX942-NEXT: v_writelane_b32 v22, s54, 15
+; GFX942-NEXT: v_writelane_b32 v22, s30, 15
; GFX942-NEXT: v_mov_b32_e32 v0, s0
-; GFX942-NEXT: v_writelane_b32 v22, s55, 16
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use alloca0 v0
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_writelane_b32 v22, s31, 16
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc
; GFX942-NEXT: ;;#ASMEND
@@ -1376,23 +1378,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s54, scc
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_readlane_b32 s55, v22, 16
-; GFX942-NEXT: v_readlane_b32 s54, v22, 15
-; GFX942-NEXT: v_readlane_b32 s53, v22, 14
-; GFX942-NEXT: v_readlane_b32 s52, v22, 13
-; GFX942-NEXT: v_readlane_b32 s51, v22, 12
-; GFX942-NEXT: v_readlane_b32 s50, v22, 11
-; GFX942-NEXT: v_readlane_b32 s49, v22, 10
-; GFX942-NEXT: v_readlane_b32 s48, v22, 9
-; GFX942-NEXT: v_readlane_b32 s39, v22, 8
-; GFX942-NEXT: v_readlane_b32 s38, v22, 7
-; GFX942-NEXT: v_readlane_b32 s37, v22, 6
-; GFX942-NEXT: v_readlane_b32 s36, v22, 5
-; GFX942-NEXT: v_readlane_b32 s35, v22, 4
-; GFX942-NEXT: v_readlane_b32 s34, v22, 3
-; GFX942-NEXT: v_readlane_b32 s33, v22, 2
-; GFX942-NEXT: v_readlane_b32 s31, v22, 1
-; GFX942-NEXT: v_readlane_b32 s30, v22, 0
+; GFX942-NEXT: v_readlane_b32 s30, v22, 15
+; GFX942-NEXT: v_readlane_b32 s31, v22, 16
+; GFX942-NEXT: v_readlane_b32 s55, v22, 14
+; GFX942-NEXT: v_readlane_b32 s54, v22, 13
+; GFX942-NEXT: v_readlane_b32 s53, v22, 12
+; GFX942-NEXT: v_readlane_b32 s52, v22, 11
+; GFX942-NEXT: v_readlane_b32 s51, v22, 10
+; GFX942-NEXT: v_readlane_b32 s50, v22, 9
+; GFX942-NEXT: v_readlane_b32 s49, v22, 8
+; GFX942-NEXT: v_readlane_b32 s48, v22, 7
+; GFX942-NEXT: v_readlane_b32 s39, v22, 6
+; GFX942-NEXT: v_readlane_b32 s38, v22, 5
+; GFX942-NEXT: v_readlane_b32 s37, v22, 4
+; GFX942-NEXT: v_readlane_b32 s36, v22, 3
+; GFX942-NEXT: v_readlane_b32 s35, v22, 2
+; GFX942-NEXT: v_readlane_b32 s34, v22, 1
+; GFX942-NEXT: v_readlane_b32 s33, v22, 0
; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX942-NEXT: s_add_i32 s2, s32, 0x8040
; GFX942-NEXT: scratch_load_dword v22, off, s2 ; 4-byte Folded Reload
@@ -1408,31 +1410,31 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX10_1-NEXT: buffer_store_dword v22, off, s[0:3], s5 ; 4-byte Folded Spill
; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3
; GFX10_1-NEXT: s_mov_b32 exec_lo, s4
-; GFX10_1-NEXT: v_writelane_b32 v22, s30, 0
+; GFX10_1-NEXT: v_writelane_b32 v22, s33, 0
; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s32
; GFX10_1-NEXT: s_lshr_b32 s4, s32, 5
; GFX10_1-NEXT: s_add_i32 s58, s4, 0x4240
-; GFX10_1-NEXT: v_writelane_b32 v22, s31, 1
+; GFX10_1-NEXT: v_writelane_b32 v22, s34, 1
; GFX10_1-NEXT: v_add_nc_u32_e32 v0, 64, v0
; GFX10_1-NEXT: s_and_b32 s4, 0, exec_lo
; GFX10_1-NEXT: ;;#ASMSTART
; GFX10_1-NEXT: ; use alloca0 v0
; GFX10_1-NEXT: ;;#ASMEND
-; GFX10_1-NEXT: v_writelane_b32 v22, s33, 2
-; GFX10_1-NEXT: v_writelane_b32 v22, s34, 3
-; GFX10_1-NEXT: v_writelane_b32 v22, s35, 4
-; GFX10_1-NEXT: v_writelane_b32 v22, s36, 5
-; GFX10_1-NEXT: v_writelane_b32 v22, s37, 6
-; GFX10_1-NEXT: v_writelane_b32 v22, s38, 7
-; GFX10_1-NEXT: v_writelane_b32 v22, s39, 8
-; GFX10_1-NEXT: v_writelane_b32 v22, s48, 9
-; GFX10_1-NEXT: v_writelane_b32 v22, s49, 10
-; GFX10_1-NEXT: v_writelane_b32 v22, s50, 11
-; GFX10_1-NEXT: v_writelane_b32 v22, s51, 12
-; GFX10_1-NEXT: v_writelane_b32 v22, s52, 13
-; GFX10_1-NEXT: v_writelane_b32 v22, s53, 14
-; GFX10_1-NEXT: v_writelane_b32 v22, s54, 15
-; GFX10_1-NEXT: v_writelane_b32 v22, s55, 16
+; GFX10_1-NEXT: v_writelane_b32 v22, s35, 2
+; GFX10_1-NEXT: v_writelane_b32 v22, s36, 3
+; GFX10_1-NEXT: v_writelane_b32 v22, s37, 4
+; GFX10_1-NEXT: v_writelane_b32 v22, s38, 5
+; GFX10_1-NEXT: v_writelane_b32 v22, s39, 6
+; GFX10_1-NEXT: v_writelane_b32 v22, s48, 7
+; GFX10_1-NEXT: v_writelane_b32 v22, s49, 8
+; GFX10_1-NEXT: v_writelane_b32 v22, s50, 9
+; GFX10_1-NEXT: v_writelane_b32 v22, s51, 10
+; GFX10_1-NEXT: v_writelane_b32 v22, s52, 11
+; GFX10_1-NEXT: v_writelane_b32 v22, s53, 12
+; GFX10_1-NEXT: v_writelane_b32 v22, s54, 13
+; GFX10_1-NEXT: v_writelane_b32 v22, s55, 14
+; GFX10_1-NEXT: v_writelane_b32 v22, s30, 15
+; GFX10_1-NEXT: v_writelane_b32 v22, s31, 16
; GFX10_1-NEXT: ;;#ASMSTART
; GFX10_1-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc
; GFX10_1-NEXT: ;;#ASMEND
@@ -1441,23 +1443,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX10_1-NEXT: ;;#ASMSTART
; GFX10_1-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s54, scc
; GFX10_1-NEXT: ;;#ASMEND
-; GFX10_1-NEXT: v_readlane_b32 s55, v22, 16
-; GFX10_1-NEXT: v_readlane_b32 s54, v22, 15
-; GFX10_1-NEXT: v_readlane_b32 s53, v22, 14
-; GFX10_1-NEXT: v_readlane_b32 s52, v22, 13
-; GFX10_1-NEXT: v_readlane_b32 s51, v22, 12
-; GFX10_1-NEXT: v_readlane_b32 s50, v22, 11
-; GFX10_1-NEXT: v_readlane_b32 s49, v22, 10
-; GFX10_1-NEXT: v_readlane_b32 s48, v22, 9
-; GFX10_1-NEXT: v_readlane_b32 s39, v22, 8
-; GFX10_1-NEXT: v_readlane_b32 s38, v22, 7
-; GFX10_1-NEXT: v_readlane_b32 s37, v22, 6
-; GFX10_1-NEXT: v_readlane_b32 s36, v22, 5
-; GFX10_1-NEXT: v_readlane_b32 s35, v22, 4
-; GFX10_1-NEXT: v_readlane_b32 s34, v22, 3
-; GFX10_1-NEXT: v_readlane_b32 s33, v22, 2
-; GFX10_1-NEXT: v_readlane_b32 s31, v22, 1
-; GFX10_1-NEXT: v_readlane_b32 s30, v22, 0
+; GFX10_1-NEXT: v_readlane_b32 s30, v22, 15
+; GFX10_1-NEXT: v_readlane_b32 s31, v22, 16
+; GFX10_1-NEXT: v_readlane_b32 s55, v22, 14
+; GFX10_1-NEXT: v_readlane_b32 s54, v22, 13
+; GFX10_1-NEXT: v_readlane_b32 s53, v22, 12
+; GFX10_1-NEXT: v_readlane_b32 s52, v22, 11
+; GFX10_1-NEXT: v_readlane_b32 s51, v22, 10
+; GFX10_1-NEXT: v_readlane_b32 s50, v22, 9
+; GFX10_1-NEXT: v_readlane_b32 s49, v22, 8
+; GFX10_1-NEXT: v_readlane_b32 s48, v22, 7
+; GFX10_1-NEXT: v_readlane_b32 s39, v22, 6
+; GFX10_1-NEXT: v_readlane_b32 s38, v22, 5
+; GFX10_1-NEXT: v_readlane_b32 s37, v22, 4
+; GFX10_1-NEXT: v_readlane_b32 s36, v22, 3
+; GFX10_1-NEXT: v_readlane_b32 s35, v22, 2
+; GFX10_1-NEXT: v_readlane_b32 s34, v22, 1
+; GFX10_1-NEXT: v_readlane_b32 s33, v22, 0
; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1
; GFX10_1-NEXT: s_add_i32 s5, s32, 0x100800
; GFX10_1-NEXT: buffer_load_dword v22, off, s[0:3], s5 ; 4-byte Folded Reload
@@ -1473,31 +1475,31 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX10_3-NEXT: s_add_i32 s5, s32, 0x100800
; GFX10_3-NEXT: buffer_store_dword v22, off, s[0:3], s5 ; 4-byte Folded Spill
; GFX10_3-NEXT: s_mov_b32 exec_lo, s4
-; GFX10_3-NEXT: v_writelane_b32 v22, s30, 0
+; GFX10_3-NEXT: v_writelane_b32 v22, s33, 0
; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s32
; GFX10_3-NEXT: s_lshr_b32 s4, s32, 5
; GFX10_3-NEXT: s_add_i32 s58, s4, 0x4240
-; GFX10_3-NEXT: v_writelane_b32 v22, s31, 1
+; GFX10_3-NEXT: v_writelane_b32 v22, s34, 1
; GFX10_3-NEXT: v_add_nc_u32_e32 v0, 64, v0
; GFX10_3-NEXT: s_and_b32 s4, 0, exec_lo
; GFX10_3-NEXT: ;;#ASMSTART
; GFX10_3-NEXT: ; use alloca0 v0
; GFX10_3-NEXT: ;;#ASMEND
-; GFX10_3-NEXT: v_writelane_b32 v22, s33, 2
-; GFX10_3-NEXT: v_writelane_b32 v22, s34, 3
-; GFX10_3-NEXT: v_writelane_b32 v22, s35, 4
-; GFX10_3-NEXT: v_writelane_b32 v22, s36, 5
-; GFX10_3-NEXT: v_writelane_b32 v22, s37, 6
-; GFX10_3-NEXT: v_writelane_b32 v22, s38, 7
-; GFX10_3-NEXT: v_writelane_b32 v22, s39, 8
-; GFX10_3-NEXT: v_writelane_b32 v22, s48, 9
-; GFX10_3-NEXT: v_writelane_b32 v22, s49, 10
-; GFX10_3-NEXT: v_writelane_b32 v22, s50, 11
-; GFX10_3-NEXT: v_writelane_b32 v22, s51, 12
-; GFX10_3-NEXT: v_writelane_b32 v22, s52, 13
-; GFX10_3-NEXT: v_writelane_b32 v22, s53, 14
-; GFX10_3-NEXT: v_writelane_b32 v22, s54, 15
-; GFX10_3-NEXT: v_writelane_b32 v22, s55, 16
+; GFX10_3-NEXT: v_writelane_b32 v22, s35, 2
+; GFX10_3-NEXT: v_writelane_b32 v22, s36, 3
+; GFX10_3-NEXT: v_writelane_b32 v22, s37, 4
+; GFX10_3-NEXT: v_writelane_b32 v22, s38, 5
+; GFX10_3-NEXT: v_writelane_b32 v22, s39, 6
+; GFX10_3-NEXT: v_writelane_b32 v22, s48, 7
+; GFX10_3-NEXT: v_writelane_b32 v22, s49, 8
+; GFX10_3-NEXT: v_writelane_b32 v22, s50, 9
+; GFX10_3-NEXT: v_writelane_b32 v22, s51, 10
+; GFX10_3-NEXT: v_writelane_b32 v22, s52, 11
+; GFX10_3-NEXT: v_writelane_b32 v22, s53, 12
+; GFX10_3-NEXT: v_writelane_b32 v22, s54, 13
+; GFX10_3-NEXT: v_writelane_b32 v22, s55, 14
+; GFX10_3-NEXT: v_writelane_b32 v22, s30, 15
+; GFX10_3-NEXT: v_writelane_b32 v22, s31, 16
; GFX10_3-NEXT: ;;#ASMSTART
; GFX10_3-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc
; GFX10_3-NEXT: ;;#ASMEND
@@ -1506,23 +1508,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX10_3-NEXT: ;;#ASMSTART
; GFX10_3-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s54, scc
; GFX10_3-NEXT: ;;#ASMEND
-; GFX10_3-NEXT: v_readlane_b32 s55, v22, 16
-; GFX10_3-NEXT: v_readlane_b32 s54, v22, 15
-; GFX10_3-NEXT: v_readlane_b32 s53, v22, 14
-; GFX10_3-NEXT: v_readlane_b32 s52, v22, 13
-; GFX10_3-NEXT: v_readlane_b32 s51, v22, 12
-; GFX10_3-NEXT: v_readlane_b32 s50, v22, 11
-; GFX10_3-NEXT: v_readlane_b32 s49, v22, 10
-; GFX10_3-NEXT: v_readlane_b32 s48, v22, 9
-; GFX10_3-NEXT: v_readlane_b32 s39, v22, 8
-; GFX10_3-NEXT: v_readlane_b32 s38, v22, 7
-; GFX10_3-NEXT: v_readlane_b32 s37, v22, 6
-; GFX10_3-NEXT: v_readlane_b32 s36, v22, 5
-; GFX10_3-NEXT: v_readlane_b32 s35, v22, 4
-; GFX10_3-NEXT: v_readlane_b32 s34, v22, 3
-; GFX10_3-NEXT: v_readlane_b32 s33, v22, 2
-; GFX10_3-NEXT: v_readlane_b32 s31, v22, 1
-; GFX10_3-NEXT: v_readlane_b32 s30, v22, 0
+; GFX10_3-NEXT: v_readlane_b32 s30, v22, 15
+; GFX10_3-NEXT: v_readlane_b32 s31, v22, 16
+; GFX10_3-NEXT: v_readlane_b32 s55, v22, 14
+; GFX10_3-NEXT: v_readlane_b32 s54, v22, 13
+; GFX10_3-NEXT: v_readlane_b32 s53, v22, 12
+; GFX10_3-NEXT: v_readlane_b32 s52, v22, 11
+; GFX10_3-NEXT: v_readlane_b32 s51, v22, 10
+; GFX10_3-NEXT: v_readlane_b32 s50, v22, 9
+; GFX10_3-NEXT: v_readlane_b32 s49, v22, 8
+; GFX10_3-NEXT: v_readlane_b32 s48, v22, 7
+; GFX10_3-NEXT: v_readlane_b32 s39, v22, 6
+; GFX10_3-NEXT: v_readlane_b32 s38, v22, 5
+; GFX10_3-NEXT: v_readlane_b32 s37, v22, 4
+; GFX10_3-NEXT: v_readlane_b32 s36, v22, 3
+; GFX10_3-NEXT: v_readlane_b32 s35, v22, 2
+; GFX10_3-NEXT: v_readlane_b32 s34, v22, 1
+; GFX10_3-NEXT: v_readlane_b32 s33, v22, 0
; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1
; GFX10_3-NEXT: s_add_i32 s5, s32, 0x100800
; GFX10_3-NEXT: buffer_load_dword v22, off, s[0:3], s5 ; 4-byte Folded Reload
@@ -1537,30 +1539,30 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX11-NEXT: s_add_i32 s1, s32, 0x8040
; GFX11-NEXT: scratch_store_b32 off, v22, s1 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v22, s30, 0
+; GFX11-NEXT: v_writelane_b32 v22, s33, 0
; GFX11-NEXT: s_add_i32 s0, s32, 64
; GFX11-NEXT: s_add_i32 s58, s32, 0x4240
; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: s_and_b32 s0, 0, exec_lo
-; GFX11-NEXT: v_writelane_b32 v22, s31, 1
+; GFX11-NEXT: v_writelane_b32 v22, s34, 1
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; use alloca0 v0
; GFX11-NEXT: ;;#ASMEND
-; GFX11-NEXT: v_writelane_b32 v22, s33, 2
-; GFX11-NEXT: v_writelane_b32 v22, s34, 3
-; GFX11-NEXT: v_writelane_b32 v22, s35, 4
-; GFX11-NEXT: v_writelane_b32 v22, s36, 5
-; GFX11-NEXT: v_writelane_b32 v22, s37, 6
-; GFX11-NEXT: v_writelane_b32 v22, s38, 7
-; GFX11-NEXT: v_writelane_b32 v22, s39, 8
-; GFX11-NEXT: v_writelane_b32 v22, s48, 9
-; GFX11-NEXT: v_writelane_b32 v22, s49, 10
-; GFX11-NEXT: v_writelane_b32 v22, s50, 11
-; GFX11-NEXT: v_writelane_b32 v22, s51, 12
-; GFX11-NEXT: v_writelane_b32 v22, s52, 13
-; GFX11-NEXT: v_writelane_b32 v22, s53, 14
-; GFX11-NEXT: v_writelane_b32 v22, s54, 15
-; GFX11-NEXT: v_writelane_b32 v22, s55, 16
+; GFX11-NEXT: v_writelane_b32 v22, s35, 2
+; GFX11-NEXT: v_writelane_b32 v22, s36, 3
+; GFX11-NEXT: v_writelane_b32 v22, s37, 4
+; GFX11-NEXT: v_writelane_b32 v22, s38, 5
+; GFX11-NEXT: v_writelane_b32 v22, s39, 6
+; GFX11-NEXT: v_writelane_b32 v22, s48, 7
+; GFX11-NEXT: v_writelane_b32 v22, s49, 8
+; GFX11-NEXT: v_writelane_b32 v22, s50, 9
+; GFX11-NEXT: v_writelane_b32 v22, s51, 10
+; GFX11-NEXT: v_writelane_b32 v22, s52, 11
+; GFX11-NEXT: v_writelane_b32 v22, s53, 12
+; GFX11-NEXT: v_writelane_b32 v22, s54, 13
+; GFX11-NEXT: v_writelane_b32 v22, s55, 14
+; GFX11-NEXT: v_writelane_b32 v22, s30, 15
+; GFX11-NEXT: v_writelane_b32 v22, s31, 16
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc
; GFX11-NEXT: ;;#ASMEND
@@ -1570,23 +1572,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX11-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s54, scc
; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s55, v22, 16
-; GFX11-NEXT: v_readlane_b32 s54, v22, 15
-; GFX11-NEXT: v_readlane_b32 s53, v22, 14
-; GFX11-NEXT: v_readlane_b32 s52, v22, 13
-; GFX11-NEXT: v_readlane_b32 s51, v22, 12
-; GFX11-NEXT: v_readlane_b32 s50, v22, 11
-; GFX11-NEXT: v_readlane_b32 s49, v22, 10
-; GFX11-NEXT: v_readlane_b32 s48, v22, 9
-; GFX11-NEXT: v_readlane_b32 s39, v22, 8
-; GFX11-NEXT: v_readlane_b32 s38, v22, 7
-; GFX11-NEXT: v_readlane_b32 s37, v22, 6
-; GFX11-NEXT: v_readlane_b32 s36, v22, 5
-; GFX11-NEXT: v_readlane_b32 s35, v22, 4
-; GFX11-NEXT: v_readlane_b32 s34, v22, 3
-; GFX11-NEXT: v_readlane_b32 s33, v22, 2
-; GFX11-NEXT: v_readlane_b32 s31, v22, 1
-; GFX11-NEXT: v_readlane_b32 s30, v22, 0
+; GFX11-NEXT: v_readlane_b32 s30, v22, 15
+; GFX11-NEXT: v_readlane_b32 s31, v22, 16
+; GFX11-NEXT: v_readlane_b32 s55, v22, 14
+; GFX11-NEXT: v_readlane_b32 s54, v22, 13
+; GFX11-NEXT: v_readlane_b32 s53, v22, 12
+; GFX11-NEXT: v_readlane_b32 s52, v22, 11
+; GFX11-NEXT: v_readlane_b32 s51, v22, 10
+; GFX11-NEXT: v_readlane_b32 s50, v22, 9
+; GFX11-NEXT: v_readlane_b32 s49, v22, 8
+; GFX11-NEXT: v_readlane_b32 s48, v22, 7
+; GFX11-NEXT: v_readlane_b32 s39, v22, 6
+; GFX11-NEXT: v_readlane_b32 s38, v22, 5
+; GFX11-NEXT: v_readlane_b32 s37, v22, 4
+; GFX11-NEXT: v_readlane_b32 s36, v22, 3
+; GFX11-NEXT: v_readlane_b32 s35, v22, 2
+; GFX11-NEXT: v_readlane_b32 s34, v22, 1
+; GFX11-NEXT: v_readlane_b32 s33, v22, 0
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
; GFX11-NEXT: s_add_i32 s1, s32, 0x8040
; GFX11-NEXT: scratch_load_b32 v22, off, s1 ; 4-byte Folded Reload
@@ -1605,29 +1607,29 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX12-NEXT: scratch_store_b32 off, v22, s32 offset:32768 ; 4-byte Folded Spill
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 exec_lo, s0
-; GFX12-NEXT: v_writelane_b32 v22, s30, 0
+; GFX12-NEXT: v_writelane_b32 v22, s33, 0
; GFX12-NEXT: s_add_co_i32 s58, s32, 0x4200
; GFX12-NEXT: v_mov_b32_e32 v0, s32
; GFX12-NEXT: s_and_b32 s0, 0, exec_lo
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ; use alloca0 v0
; GFX12-NEXT: ;;#ASMEND
-; GFX12-NEXT: v_writelane_b32 v22, s31, 1
-; GFX12-NEXT: v_writelane_b32 v22, s33, 2
-; GFX12-NEXT: v_writelane_b32 v22, s34, 3
-; GFX12-NEXT: v_writelane_b32 v22, s35, 4
-; GFX12-NEXT: v_writelane_b32 v22, s36, 5
-; GFX12-NEXT: v_writelane_b32 v22, s37, 6
-; GFX12-NEXT: v_writelane_b32 v22, s38, 7
-; GFX12-NEXT: v_writelane_b32 v22, s39, 8
-; GFX12-NEXT: v_writelane_b32 v22, s48, 9
-; GFX12-NEXT: v_writelane_b32 v22, s49, 10
-; GFX12-NEXT: v_writelane_b32 v22, s50, 11
-; GFX12-NEXT: v_writelane_b32 v22, s51, 12
-; GFX12-NEXT: v_writelane_b32 v22, s52, 13
-; GFX12-NEXT: v_writelane_b32 v22, s53, 14
-; GFX12-NEXT: v_writelane_b32 v22, s54, 15
-; GFX12-NEXT: v_writelane_b32 v22, s55, 16
+; GFX12-NEXT: v_writelane_b32 v22, s34, 1
+; GFX12-NEXT: v_writelane_b32 v22, s35, 2
+; GFX12-NEXT: v_writelane_b32 v22, s36, 3
+; GFX12-NEXT: v_writelane_b32 v22, s37, 4
+; GFX12-NEXT: v_writelane_b32 v22, s38, 5
+; GFX12-NEXT: v_writelane_b32 v22, s39, 6
+; GFX12-NEXT: v_writelane_b32 v22, s48, 7
+; GFX12-NEXT: v_writelane_b32 v22, s49, 8
+; GFX12-NEXT: v_writelane_b32 v22, s50, 9
+; GFX12-NEXT: v_writelane_b32 v22, s51, 10
+; GFX12-NEXT: v_writelane_b32 v22, s52, 11
+; GFX12-NEXT: v_writelane_b32 v22, s53, 12
+; GFX12-NEXT: v_writelane_b32 v22, s54, 13
+; GFX12-NEXT: v_writelane_b32 v22, s55, 14
+; GFX12-NEXT: v_writelane_b32 v22, s30, 15
+; GFX12-NEXT: v_writelane_b32 v22, s31, 16
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc
; GFX12-NEXT: ;;#ASMEND
@@ -1637,23 +1639,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s54, scc
; GFX12-NEXT: ;;#ASMEND
-; GFX12-NEXT: v_readlane_b32 s55, v22, 16
-; GFX12-NEXT: v_readlane_b32 s54, v22, 15
-; GFX12-NEXT: v_readlane_b32 s53, v22, 14
-; GFX12-NEXT: v_readlane_b32 s52, v22, 13
-; GFX12-NEXT: v_readlane_b32 s51, v22, 12
-; GFX12-NEXT: v_readlane_b32 s50, v22, 11
-; GFX12-NEXT: v_readlane_b32 s49, v22, 10
-; GFX12-NEXT: v_readlane_b32 s48, v22, 9
-; GFX12-NEXT: v_readlane_b32 s39, v22, 8
-; GFX12-NEXT: v_readlane_b32 s38, v22, 7
-; GFX12-NEXT: v_readlane_b32 s37, v22, 6
-; GFX12-NEXT: v_readlane_b32 s36, v22, 5
-; GFX12-NEXT: v_readlane_b32 s35, v22, 4
-; GFX12-NEXT: v_readlane_b32 s34, v22, 3
-; GFX12-NEXT: v_readlane_b32 s33, v22, 2
-; GFX12-NEXT: v_readlane_b32 s31, v22, 1
-; GFX12-NEXT: v_readlane_b32 s30, v22, 0
+; GFX12-NEXT: v_readlane_b32 s30, v22, 15
+; GFX12-NEXT: v_readlane_b32 s31, v22, 16
+; GFX12-NEXT: v_readlane_b32 s55, v22, 14
+; GFX12-NEXT: v_readlane_b32 s54, v22, 13
+; GFX12-NEXT: v_readlane_b32 s53, v22, 12
+; GFX12-NEXT: v_readlane_b32 s52, v22, 11
+; GFX12-NEXT: v_readlane_b32 s51, v22, 10
+; GFX12-NEXT: v_readlane_b32 s50, v22, 9
+; GFX12-NEXT: v_readlane_b32 s49, v22, 8
+; GFX12-NEXT: v_readlane_b32 s48, v22, 7
+; GFX12-NEXT: v_readlane_b32 s39, v22, 6
+; GFX12-NEXT: v_readlane_b32 s38, v22, 5
+; GFX12-NEXT: v_readlane_b32 s37, v22, 4
+; GFX12-NEXT: v_readlane_b32 s36, v22, 3
+; GFX12-NEXT: v_readlane_b32 s35, v22, 2
+; GFX12-NEXT: v_readlane_b32 s34, v22, 1
+; GFX12-NEXT: v_readlane_b32 s33, v22, 0
; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
; GFX12-NEXT: scratch_load_b32 v22, off, s32 offset:32768 ; 4-byte Folded Reload
; GFX12-NEXT: s_wait_alu 0xfffe
diff --git a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
index 33cd598aae9b5..7b87cb014ecbf 100644
--- a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
@@ -194,22 +194,22 @@ define void @slsr1_1(i32 %b.arg, i32 %s.arg) #0 {
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[6:7]
; GFX9-NEXT: v_writelane_b32 v43, s4, 5
-; GFX9-NEXT: v_writelane_b32 v43, s30, 0
-; GFX9-NEXT: v_writelane_b32 v43, s31, 1
; GFX9-NEXT: s_addk_i32 s32, 0x800
-; GFX9-NEXT: v_writelane_b32 v43, s34, 2
-; GFX9-NEXT: v_writelane_b32 v43, s36, 3
+; GFX9-NEXT: v_writelane_b32 v43, s34, 0
+; GFX9-NEXT: v_writelane_b32 v43, s36, 1
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, foo@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, foo@gotpcrel32@hi+12
-; GFX9-NEXT: v_writelane_b32 v43, s37, 4
+; GFX9-NEXT: v_writelane_b32 v43, s37, 2
; GFX9-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x0
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v40, v1
; GFX9-NEXT: v_mov_b32_e32 v41, v0
+; GFX9-NEXT: v_writelane_b32 v43, s30, 3
; GFX9-NEXT: v_mul_u32_u24_e32 v0, v41, v40
+; GFX9-NEXT: v_writelane_b32 v43, s31, 4
; GFX9-NEXT: s_mov_b32 s34, s15
; GFX9-NEXT: v_and_b32_e32 v42, 0xffffff, v40
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -224,11 +224,11 @@ define void @slsr1_1(i32 %b.arg, i32 %s.arg) #0 {
; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; GFX9-NEXT: v_readlane_b32 s37, v43, 4
-; GFX9-NEXT: v_readlane_b32 s36, v43, 3
-; GFX9-NEXT: v_readlane_b32 s34, v43, 2
-; GFX9-NEXT: v_readlane_b32 s31, v43, 1
-; GFX9-NEXT: v_readlane_b32 s30, v43, 0
+; GFX9-NEXT: v_readlane_b32 s30, v43, 3
+; GFX9-NEXT: v_readlane_b32 s31, v43, 4
+; GFX9-NEXT: v_readlane_b32 s37, v43, 2
+; GFX9-NEXT: v_readlane_b32 s36, v43, 1
+; GFX9-NEXT: v_readlane_b32 s34, v43, 0
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s4, v43, 5
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll
index 65446a036c91b..878302e4865bb 100644
--- a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll
+++ b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll
@@ -47,8 +47,8 @@ define internal fastcc void @csr_vgpr_spill_fp_callee() #0 {
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; clobber csr v40
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_readlane_b32 s31, v1, 1
; CHECK-NEXT: v_readlane_b32 s30, v1, 0
+; CHECK-NEXT: v_readlane_b32 s31, v1, 1
; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b32 s32, s33
; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1
@@ -190,8 +190,8 @@ define hidden i32 @caller_save_vgpr_spill_fp_tail_call() #0 {
; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21]
; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23]
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT: v_readlane_b32 s31, v1, 1
; CHECK-NEXT: v_readlane_b32 s30, v1, 0
+; CHECK-NEXT: v_readlane_b32 s31, v1, 1
; CHECK-NEXT: s_mov_b32 s32, s33
; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1
; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -224,8 +224,8 @@ define hidden i32 @caller_save_vgpr_spill_fp() #0 {
; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21]
; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23]
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT: v_readlane_b32 s31, v2, 1
; CHECK-NEXT: v_readlane_b32 s30, v2, 0
+; CHECK-NEXT: v_readlane_b32 s31, v2, 1
; CHECK-NEXT: s_mov_b32 s32, s33
; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1
; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/nested-calls.ll b/llvm/test/CodeGen/AMDGPU/nested-calls.ll
index ccaf0ac5377e4..8394b325bee6d 100644
--- a/llvm/test/CodeGen/AMDGPU/nested-calls.ll
+++ b/llvm/test/CodeGen/AMDGPU/nested-calls.ll
@@ -29,8 +29,8 @@ define void @test_func_call_external_void_func_i32_imm() #0 {
; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_readlane_b32 s30, v40, 0
+; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: v_readlane_b32 s4, v40, 2
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -68,8 +68,8 @@ define void @test_func_call_external_void_func_i32_imm_stack_use() #0 {
; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_readlane_b32 s30, v40, 0
+; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: v_readlane_b32 s4, v40, 2
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll b/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll
index 6b6c60ebe2a9e..133cc166c3311 100644
--- a/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll
+++ b/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll
@@ -247,8 +247,8 @@ define hidden void @_ZL3barv() #0 !dbg !1644 {
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: .Ltmp1:
; CHECK-NEXT: .loc 0 32 1 ; lane-info.cpp:32:1
-; CHECK-NEXT: v_readlane_b32 s31, v40, 1
; CHECK-NEXT: v_readlane_b32 s30, v40, 0
+; CHECK-NEXT: v_readlane_b32 s31, v40, 1
; CHECK-NEXT: s_mov_b32 s32, s33
; CHECK-NEXT: v_readlane_b32 s4, v40, 2
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll b/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll
index dba10f19eb500..ef5681798fb19 100644
--- a/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll
+++ b/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll
@@ -20,9 +20,9 @@ define void @test_remat_s_getpc_b64() {
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: v_readlane_b32 s30, v2, 0
; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT: v_readlane_b32 s31, v2, 1
-; GFX9-NEXT: v_readlane_b32 s30, v2, 0
; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
@@ -45,8 +45,8 @@ define void @test_remat_s_getpc_b64() {
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: v_readlane_b32 s31, v2, 1
; GFX11-NEXT: v_readlane_b32 s30, v2, 0
+; GFX11-NEXT: v_readlane_b32 s31, v2, 1
; GFX11-NEXT: global_store_b64 v[0:1], v[0:1], off
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_load_b32 v2, off, s32 ; 4-byte Folded Reload
@@ -79,8 +79,8 @@ define void @test_remat_s_getpc_b64() {
; GFX12-NEXT: s_sext_i32_i16 s1, s1
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_readlane_b32 s31, v2, 1
; GFX12-NEXT: v_readlane_b32 s30, v2, 0
+; GFX12-NEXT: v_readlane_b32 s31, v2, 1
; GFX12-NEXT: global_store_b64 v[0:1], v[0:1], off
; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
; GFX12-NEXT: scratch_load_b32 v2, off, s32 ; 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir
index 925984b15367d..4a2829947f9a0 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir
@@ -28,16 +28,16 @@ body: |
; GCN-LABEL: name: test_main
; GCN: bb.0:
; GCN-NEXT: successors: %bb.1(0x80000000)
- ; GCN-NEXT: liveins: $vcc_hi, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $sgpr30, $sgpr31, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $sgpr96, $sgpr97, $sgpr98, $sgpr99, $sgpr100, $sgpr101, $sgpr102, $sgpr103, $vgpr0
+ ; GCN-NEXT: liveins: $vcc_hi, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $sgpr96, $sgpr97, $sgpr98, $sgpr99, $sgpr100, $sgpr101, $sgpr102, $sgpr103, $vgpr0, $sgpr30_sgpr31
; GCN-NEXT: {{ $}}
; GCN-NEXT: $vcc_hi = frame-setup COPY $sgpr33
; GCN-NEXT: $sgpr33 = frame-setup COPY $sgpr32
; GCN-NEXT: $sgpr0 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
- ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.69, addrspace 5)
- ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr2, $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.70, addrspace 5)
- ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr3, $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.71, addrspace 5)
- ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr4, $sgpr33, 12, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.72, addrspace 5)
- ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr5, $sgpr33, 16, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.73, addrspace 5)
+ ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.68, addrspace 5)
+ ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr2, $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.69, addrspace 5)
+ ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr3, $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.70, addrspace 5)
+ ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr4, $sgpr33, 12, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.71, addrspace 5)
+ ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr5, $sgpr33, 16, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.72, addrspace 5)
; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0
; GCN-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 24, implicit-def dead $scc
; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, $vgpr2
@@ -66,48 +66,48 @@ body: |
; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr27, 23, $vgpr2
; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr28, 24, $vgpr2
; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr29, 25, $vgpr2
- ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr30, 26, $vgpr2
- ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr31, 27, $vgpr2
- ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr64, 28, $vgpr2
- ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr65, 29, $vgpr2
- ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr66, 30, $vgpr2
- ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr67, 31, $vgpr2
- ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr68, 0, $vgpr3
- ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr69, 1, $vgpr3
- ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr70, 2, $vgpr3
- ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr71, 3, $vgpr3
- ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr72, 4, $vgpr3
- ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr73, 5, $vgpr3
- ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr74, 6, $vgpr3
- ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr75, 7, $vgpr3
- ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr76, 8, $vgpr3
- ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr77, 9, $vgpr3
- ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr78, 10, $vgpr3
- ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr79, 11, $vgpr3
- ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr80, 12, $vgpr3
- ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr81, 13, $vgpr3
- ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr82, 14, $vgpr3
- ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr83, 15, $vgpr3
- ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr84, 16, $vgpr3
- ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr85, 17, $vgpr3
- ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr86, 18, $vgpr3
- ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr87, 19, $vgpr3
- ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr88, 20, $vgpr3
- ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr89, 21, $vgpr3
- ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr90, 22, $vgpr3
- ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr91, 23, $vgpr3
- ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr92, 24, $vgpr3
- ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr93, 25, $vgpr3
- ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr94, 26, $vgpr3
- ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr95, 27, $vgpr3
- ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr96, 28, $vgpr3
- ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr97, 29, $vgpr3
- ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr98, 30, $vgpr3
- ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr99, 31, $vgpr3
- ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr100, 0, $vgpr4
- ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr101, 1, $vgpr4
- ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr102, 2, $vgpr4
- ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr103, 3, $vgpr4
+ ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr64, 26, $vgpr2
+ ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr65, 27, $vgpr2
+ ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr66, 28, $vgpr2
+ ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr67, 29, $vgpr2
+ ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr68, 30, $vgpr2
+ ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr69, 31, $vgpr2
+ ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr70, 0, $vgpr3
+ ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr71, 1, $vgpr3
+ ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr72, 2, $vgpr3
+ ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr73, 3, $vgpr3
+ ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr74, 4, $vgpr3
+ ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr75, 5, $vgpr3
+ ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr76, 6, $vgpr3
+ ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr77, 7, $vgpr3
+ ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr78, 8, $vgpr3
+ ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr79, 9, $vgpr3
+ ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr80, 10, $vgpr3
+ ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr81, 11, $vgpr3
+ ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr82, 12, $vgpr3
+ ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr83, 13, $vgpr3
+ ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr84, 14, $vgpr3
+ ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr85, 15, $vgpr3
+ ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr86, 16, $vgpr3
+ ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr87, 17, $vgpr3
+ ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr88, 18, $vgpr3
+ ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr89, 19, $vgpr3
+ ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr90, 20, $vgpr3
+ ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr91, 21, $vgpr3
+ ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr92, 22, $vgpr3
+ ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr93, 23, $vgpr3
+ ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr94, 24, $vgpr3
+ ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr95, 25, $vgpr3
+ ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr96, 26, $vgpr3
+ ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr97, 27, $vgpr3
+ ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr98, 28, $vgpr3
+ ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr99, 29, $vgpr3
+ ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr100, 30, $vgpr3
+ ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr101, 31, $vgpr3
+ ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr102, 0, $vgpr4
+ ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr103, 1, $vgpr4
+ ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr30, 2, $vgpr4, implicit-def $sgpr30_sgpr31, implicit $sgpr30_sgpr31
+ ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr31, 3, $vgpr4, implicit $sgpr30_sgpr31
; GCN-NEXT: $sgpr22 = IMPLICIT_DEF
; GCN-NEXT: $vgpr5 = IMPLICIT_DEF
; GCN-NEXT: $vgpr5 = SI_SPILL_S32_TO_VGPR $sgpr22, 0, killed $vgpr5
@@ -130,48 +130,48 @@ body: |
; GCN-NEXT: bb.3:
; GCN-NEXT: liveins: $vcc_hi
; GCN-NEXT: {{ $}}
- ; GCN-NEXT: $sgpr103 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 3
- ; GCN-NEXT: $sgpr102 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 2
- ; GCN-NEXT: $sgpr101 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 1
- ; GCN-NEXT: $sgpr100 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 0
- ; GCN-NEXT: $sgpr99 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 31
- ; GCN-NEXT: $sgpr98 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 30
- ; GCN-NEXT: $sgpr97 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 29
- ; GCN-NEXT: $sgpr96 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 28
- ; GCN-NEXT: $sgpr95 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 27
- ; GCN-NEXT: $sgpr94 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 26
- ; GCN-NEXT: $sgpr93 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 25
- ; GCN-NEXT: $sgpr92 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 24
- ; GCN-NEXT: $sgpr91 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 23
- ; GCN-NEXT: $sgpr90 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 22
- ; GCN-NEXT: $sgpr89 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 21
- ; GCN-NEXT: $sgpr88 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 20
- ; GCN-NEXT: $sgpr87 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 19
- ; GCN-NEXT: $sgpr86 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 18
- ; GCN-NEXT: $sgpr85 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 17
- ; GCN-NEXT: $sgpr84 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 16
- ; GCN-NEXT: $sgpr83 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 15
- ; GCN-NEXT: $sgpr82 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 14
- ; GCN-NEXT: $sgpr81 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 13
- ; GCN-NEXT: $sgpr80 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 12
- ; GCN-NEXT: $sgpr79 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 11
- ; GCN-NEXT: $sgpr78 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 10
- ; GCN-NEXT: $sgpr77 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 9
- ; GCN-NEXT: $sgpr76 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 8
- ; GCN-NEXT: $sgpr75 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 7
- ; GCN-NEXT: $sgpr74 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 6
- ; GCN-NEXT: $sgpr73 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 5
- ; GCN-NEXT: $sgpr72 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 4
- ; GCN-NEXT: $sgpr71 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 3
- ; GCN-NEXT: $sgpr70 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 2
- ; GCN-NEXT: $sgpr69 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 1
- ; GCN-NEXT: $sgpr68 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 0
- ; GCN-NEXT: $sgpr67 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 31
- ; GCN-NEXT: $sgpr66 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 30
- ; GCN-NEXT: $sgpr65 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 29
- ; GCN-NEXT: $sgpr64 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 28
- ; GCN-NEXT: $sgpr31 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 27
- ; GCN-NEXT: $sgpr30 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 26
+ ; GCN-NEXT: $sgpr30 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 2, implicit-def $sgpr30_sgpr31
+ ; GCN-NEXT: $sgpr31 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 3
+ ; GCN-NEXT: $sgpr103 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 1
+ ; GCN-NEXT: $sgpr102 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 0
+ ; GCN-NEXT: $sgpr101 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 31
+ ; GCN-NEXT: $sgpr100 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 30
+ ; GCN-NEXT: $sgpr99 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 29
+ ; GCN-NEXT: $sgpr98 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 28
+ ; GCN-NEXT: $sgpr97 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 27
+ ; GCN-NEXT: $sgpr96 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 26
+ ; GCN-NEXT: $sgpr95 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 25
+ ; GCN-NEXT: $sgpr94 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 24
+ ; GCN-NEXT: $sgpr93 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 23
+ ; GCN-NEXT: $sgpr92 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 22
+ ; GCN-NEXT: $sgpr91 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 21
+ ; GCN-NEXT: $sgpr90 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 20
+ ; GCN-NEXT: $sgpr89 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 19
+ ; GCN-NEXT: $sgpr88 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 18
+ ; GCN-NEXT: $sgpr87 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 17
+ ; GCN-NEXT: $sgpr86 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 16
+ ; GCN-NEXT: $sgpr85 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 15
+ ; GCN-NEXT: $sgpr84 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 14
+ ; GCN-NEXT: $sgpr83 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 13
+ ; GCN-NEXT: $sgpr82 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 12
+ ; GCN-NEXT: $sgpr81 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 11
+ ; GCN-NEXT: $sgpr80 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 10
+ ; GCN-NEXT: $sgpr79 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 9
+ ; GCN-NEXT: $sgpr78 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 8
+ ; GCN-NEXT: $sgpr77 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 7
+ ; GCN-NEXT: $sgpr76 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 6
+ ; GCN-NEXT: $sgpr75 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 5
+ ; GCN-NEXT: $sgpr74 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 4
+ ; GCN-NEXT: $sgpr73 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 3
+ ; GCN-NEXT: $sgpr72 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 2
+ ; GCN-NEXT: $sgpr71 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 1
+ ; GCN-NEXT: $sgpr70 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 0
+ ; GCN-NEXT: $sgpr69 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 31
+ ; GCN-NEXT: $sgpr68 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 30
+ ; GCN-NEXT: $sgpr67 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 29
+ ; GCN-NEXT: $sgpr66 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 28
+ ; GCN-NEXT: $sgpr65 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 27
+ ; GCN-NEXT: $sgpr64 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 26
; GCN-NEXT: $sgpr29 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 25
; GCN-NEXT: $sgpr28 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 24
; GCN-NEXT: $sgpr27 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 23
@@ -200,11 +200,11 @@ body: |
; GCN-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 0
; GCN-NEXT: $sgpr32 = frame-destroy COPY $sgpr33
; GCN-NEXT: $sgpr0 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
- ; GCN-NEXT: $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.69, addrspace 5)
- ; GCN-NEXT: $vgpr2 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.70, addrspace 5)
- ; GCN-NEXT: $vgpr3 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.71, addrspace 5)
- ; GCN-NEXT: $vgpr4 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 12, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.72, addrspace 5)
- ; GCN-NEXT: $vgpr5 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 16, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.73, addrspace 5)
+ ; GCN-NEXT: $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.68, addrspace 5)
+ ; GCN-NEXT: $vgpr2 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.69, addrspace 5)
+ ; GCN-NEXT: $vgpr3 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.70, addrspace 5)
+ ; GCN-NEXT: $vgpr4 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 12, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.71, addrspace 5)
+ ; GCN-NEXT: $vgpr5 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 16, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.72, addrspace 5)
; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0
; GCN-NEXT: $sgpr33 = frame-destroy COPY $vcc_hi
; GCN-NEXT: S_ENDPGM 0
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll
index 702953c56a5cb..cb54b0ba629c3 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll
@@ -152,8 +152,8 @@ define void @spill_sgpr_with_no_lower_vgpr_available() #0 {
; GCN-NEXT: s_mov_b64 s[2:3], s[22:23]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GCN-NEXT: v_readlane_b32 s31, v255, 1
; GCN-NEXT: v_readlane_b32 s30, v255, 0
+; GCN-NEXT: v_readlane_b32 s31, v255, 1
; GCN-NEXT: buffer_load_dword v254, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v253, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v252, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
@@ -445,8 +445,8 @@ define void @spill_to_lowest_available_vgpr() #0 {
; GCN-NEXT: s_mov_b64 s[2:3], s[22:23]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GCN-NEXT: v_readlane_b32 s31, v254, 1
; GCN-NEXT: v_readlane_b32 s30, v254, 0
+; GCN-NEXT: v_readlane_b32 s31, v254, 1
; GCN-NEXT: buffer_load_dword v253, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v252, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v251, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
@@ -1632,21 +1632,14 @@ define void @spill_sgpr_no_free_vgpr_ipra() #0 {
; GCN-NEXT: buffer_store_dword v254, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v255, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 s[16:17], exec
-; GCN-NEXT: s_mov_b64 exec, 1
+; GCN-NEXT: s_mov_b64 exec, 3
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:456
; GCN-NEXT: v_writelane_b32 v0, s30, 0
+; GCN-NEXT: v_writelane_b32 v0, s31, 1
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:448 ; 4-byte Folded Spill
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:456
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_mov_b64 exec, s[16:17]
-; GCN-NEXT: s_mov_b64 s[16:17], exec
-; GCN-NEXT: s_mov_b64 exec, 1
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:456
-; GCN-NEXT: v_writelane_b32 v0, s31, 0
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:452 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:456
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_mov_b64 exec, s[16:17]
; GCN-NEXT: s_getpc_b64 s[16:17]
; GCN-NEXT: s_add_u32 s16, s16, child_function_ipra@rel32@lo+4
; GCN-NEXT: s_addc_u32 s17, s17, child_function_ipra@rel32@hi+12
@@ -1656,20 +1649,12 @@ define void @spill_sgpr_no_free_vgpr_ipra() #0 {
; GCN-NEXT: s_mov_b64 s[2:3], s[22:23]
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GCN-NEXT: s_mov_b64 s[4:5], exec
-; GCN-NEXT: s_mov_b64 exec, 1
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:456
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:452 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_readlane_b32 s31, v0, 0
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:456
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
-; GCN-NEXT: s_mov_b64 s[4:5], exec
-; GCN-NEXT: s_mov_b64 exec, 1
+; GCN-NEXT: s_mov_b64 exec, 3
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:456
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:448 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_readlane_b32 s30, v0, 0
+; GCN-NEXT: v_readlane_b32 s31, v0, 1
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:456
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_mov_b64 exec, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll
index 1c2215d39dc02..feaca47f98e36 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll
@@ -14610,13 +14610,13 @@ define void @s_shuffle_v2i64_v8i64__15_2() {
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_mov_b32 s8, s30
; GFX900-NEXT: s_mov_b32 s9, s31
+; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_mov_b32 s10, s12
; GFX900-NEXT: s_mov_b32 s11, s13
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:11]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_readlane_b32 s31, v0, 1
-; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
@@ -14639,13 +14639,13 @@ define void @s_shuffle_v2i64_v8i64__15_2() {
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b32 s8, s30
; GFX90A-NEXT: s_mov_b32 s9, s31
+; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_mov_b32 s10, s12
; GFX90A-NEXT: s_mov_b32 s11, s13
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:11]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
@@ -14750,13 +14750,13 @@ define void @s_shuffle_v2i64_v8i64__15_4() {
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_mov_b32 s8, s30
; GFX900-NEXT: s_mov_b32 s9, s31
+; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_mov_b32 s10, s12
; GFX900-NEXT: s_mov_b32 s11, s13
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:11]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_readlane_b32 s31, v0, 1
-; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
@@ -14779,13 +14779,13 @@ define void @s_shuffle_v2i64_v8i64__15_4() {
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b32 s8, s30
; GFX90A-NEXT: s_mov_b32 s9, s31
+; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_mov_b32 s10, s12
; GFX90A-NEXT: s_mov_b32 s11, s13
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:11]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
@@ -14802,19 +14802,19 @@ define void @s_shuffle_v2i64_v8i64__15_4() {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[4:19]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_mov_b32 s10, s12
+; GFX942-NEXT: s_mov_b32 s11, s13
; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_mov_b32 s8, s30
; GFX942-NEXT: s_mov_b32 s9, s31
-; GFX942-NEXT: s_mov_b32 s10, s12
-; GFX942-NEXT: s_mov_b32 s11, s13
+; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:11]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_readlane_b32 s31, v0, 1
-; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
@@ -14845,12 +14845,12 @@ define void @s_shuffle_v2i64_v8i64__15_5() {
; GFX900-NEXT: s_mov_b32 s12, s30
; GFX900-NEXT: s_mov_b32 s13, s31
; GFX900-NEXT: s_mov_b64 s[8:9], s[12:13]
+; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_mov_b64 s[10:11], s[14:15]
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:11]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_readlane_b32 s31, v0, 1
-; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
@@ -14874,12 +14874,12 @@ define void @s_shuffle_v2i64_v8i64__15_5() {
; GFX90A-NEXT: s_mov_b32 s12, s30
; GFX90A-NEXT: s_mov_b32 s13, s31
; GFX90A-NEXT: s_mov_b64 s[8:9], s[12:13]
+; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_mov_b64 s[10:11], s[14:15]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:11]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
@@ -14999,22 +14999,22 @@ define void @s_shuffle_v2i64_v8i64__15_6() {
; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
; GFX942-NEXT: v_writelane_b32 v0, s30, 0
-; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[0:15]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_mov_b32 s10, s12
+; GFX942-NEXT: s_mov_b32 s11, s13
+; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_mov_b32 s8, s30
; GFX942-NEXT: s_mov_b32 s9, s31
-; GFX942-NEXT: s_mov_b32 s10, s12
-; GFX942-NEXT: s_mov_b32 s11, s13
+; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:11]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_readlane_b32 s31, v0, 1
-; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
@@ -15120,6 +15120,7 @@ define void @s_shuffle_v2i64_v8i64__15_7() {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[0:15]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
@@ -15127,12 +15128,12 @@ define void @s_shuffle_v2i64_v8i64__15_7() {
; GFX942-NEXT: s_mov_b32 s12, s30
; GFX942-NEXT: s_mov_b32 s13, s31
; GFX942-NEXT: s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_mov_b64 s[10:11], s[14:15]
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:11]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_readlane_b32 s31, v0, 1
-; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
@@ -16167,20 +16168,21 @@ define void @s_shuffle_v2i64_v8i64__12_0() {
; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
; GFX942-NEXT: v_writelane_b32 v0, s30, 0
-; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[16:31]
+; GFX942-NEXT: ; def s[0:15]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:15]
+; GFX942-NEXT: ; def s[16:31]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_mov_b32 s10, s16
+; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_mov_b32 s11, s17
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:11]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_readlane_b32 s31, v0, 1
-; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
@@ -16890,20 +16892,21 @@ define void @s_shuffle_v2i64_v8i64__12_1() {
; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
; GFX942-NEXT: v_writelane_b32 v0, s30, 0
-; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[16:31]
+; GFX942-NEXT: ; def s[0:15]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:15]
+; GFX942-NEXT: ; def s[16:31]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_mov_b32 s10, s18
+; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_mov_b32 s11, s19
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:11]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_readlane_b32 s31, v0, 1
-; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
@@ -17481,6 +17484,7 @@ define void @s_shuffle_v2i64_v8i64__9_2() {
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[16:31]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_mov_b32 s8, s18
; GFX900-NEXT: s_mov_b32 s9, s19
; GFX900-NEXT: s_mov_b32 s10, s12
@@ -17489,7 +17493,6 @@ define void @s_shuffle_v2i64_v8i64__9_2() {
; GFX900-NEXT: ; use s[8:11]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_readlane_b32 s31, v0, 1
-; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
@@ -17510,6 +17513,7 @@ define void @s_shuffle_v2i64_v8i64__9_2() {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[16:31]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_mov_b32 s8, s18
; GFX90A-NEXT: s_mov_b32 s9, s19
; GFX90A-NEXT: s_mov_b32 s10, s12
@@ -17518,7 +17522,6 @@ define void @s_shuffle_v2i64_v8i64__9_2() {
; GFX90A-NEXT: ; use s[8:11]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
@@ -17565,13 +17568,13 @@ define void @s_shuffle_v2i64_v8i64__10_2() {
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[4:19]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_mov_b32 s10, s20
; GFX900-NEXT: s_mov_b32 s11, s21
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:11]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_readlane_b32 s31, v0, 1
-; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
@@ -17592,13 +17595,13 @@ define void @s_shuffle_v2i64_v8i64__10_2() {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[4:19]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_mov_b32 s10, s20
; GFX90A-NEXT: s_mov_b32 s11, s21
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:11]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
@@ -17612,6 +17615,7 @@ define void @s_shuffle_v2i64_v8i64__10_2() {
; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
; GFX942-NEXT: v_writelane_b32 v0, s30, 0
+; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
@@ -17620,13 +17624,13 @@ define void @s_shuffle_v2i64_v8i64__10_2() {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[4:19]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_mov_b32 s10, s20
; GFX942-NEXT: s_mov_b32 s11, s21
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:11]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_readlane_b32 s31, v0, 1
-; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
@@ -17654,6 +17658,7 @@ define void @s_shuffle_v2i64_v8i64__11_2() {
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[16:31]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_mov_b32 s8, s22
; GFX900-NEXT: s_mov_b32 s9, s23
; GFX900-NEXT: s_mov_b32 s10, s12
@@ -17662,7 +17667,6 @@ define void @s_shuffle_v2i64_v8i64__11_2() {
; GFX900-NEXT: ; use s[8:11]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_readlane_b32 s31, v0, 1
-; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
@@ -17683,6 +17687,7 @@ define void @s_shuffle_v2i64_v8i64__11_2() {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[16:31]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_mov_b32 s8, s22
; GFX90A-NEXT: s_mov_b32 s9, s23
; GFX90A-NEXT: s_mov_b32 s10, s12
@@ -17691,7 +17696,6 @@ define void @s_shuffle_v2i64_v8i64__11_2() {
; GFX90A-NEXT: ; use s[8:11]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
@@ -17798,6 +17802,7 @@ define void @s_shuffle_v2i64_v8i64__13_2() {
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[16:31]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_mov_b32 s8, s26
; GFX900-NEXT: s_mov_b32 s9, s27
; GFX900-NEXT: s_mov_b32 s10, s12
@@ -17806,7 +17811,6 @@ define void @s_shuffle_v2i64_v8i64__13_2() {
; GFX900-NEXT: ; use s[8:11]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_readlane_b32 s31, v0, 1
-; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
@@ -17827,6 +17831,7 @@ define void @s_shuffle_v2i64_v8i64__13_2() {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[16:31]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_mov_b32 s8, s26
; GFX90A-NEXT: s_mov_b32 s9, s27
; GFX90A-NEXT: s_mov_b32 s10, s12
@@ -17835,7 +17840,6 @@ define void @s_shuffle_v2i64_v8i64__13_2() {
; GFX90A-NEXT: ; use s[8:11]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
@@ -18315,13 +18319,13 @@ define void @s_shuffle_v2i64_v8i64__10_3() {
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[4:19]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_mov_b32 s10, s22
; GFX900-NEXT: s_mov_b32 s11, s23
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:11]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_readlane_b32 s31, v0, 1
-; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
@@ -18342,13 +18346,13 @@ define void @s_shuffle_v2i64_v8i64__10_3() {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[4:19]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_mov_b32 s10, s22
; GFX90A-NEXT: s_mov_b32 s11, s23
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:11]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
@@ -18362,6 +18366,7 @@ define void @s_shuffle_v2i64_v8i64__10_3() {
; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
; GFX942-NEXT: v_writelane_b32 v0, s30, 0
+; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
@@ -18370,13 +18375,13 @@ define void @s_shuffle_v2i64_v8i64__10_3() {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[4:19]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_mov_b32 s10, s22
; GFX942-NEXT: s_mov_b32 s11, s23
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:11]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_readlane_b32 s31, v0, 1
-; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
@@ -18950,6 +18955,7 @@ define void @s_shuffle_v2i64_v8i64__9_4() {
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[16:31]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_mov_b32 s8, s18
; GFX900-NEXT: s_mov_b32 s9, s19
; GFX900-NEXT: s_mov_b32 s10, s12
@@ -18958,7 +18964,6 @@ define void @s_shuffle_v2i64_v8i64__9_4() {
; GFX900-NEXT: ; use s[8:11]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_readlane_b32 s31, v0, 1
-; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
@@ -18979,6 +18984,7 @@ define void @s_shuffle_v2i64_v8i64__9_4() {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[16:31]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_mov_b32 s8, s18
; GFX90A-NEXT: s_mov_b32 s9, s19
; GFX90A-NEXT: s_mov_b32 s10, s12
@@ -18987,7 +18993,6 @@ define void @s_shuffle_v2i64_v8i64__9_4() {
; GFX90A-NEXT: ; use s[8:11]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
@@ -19004,19 +19009,19 @@ define void @s_shuffle_v2i64_v8i64__9_4() {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[4:19]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_mov_b32 s10, s12
+; GFX942-NEXT: s_mov_b32 s11, s13
; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_mov_b32 s8, s18
+; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_mov_b32 s9, s19
-; GFX942-NEXT: s_mov_b32 s10, s12
-; GFX942-NEXT: s_mov_b32 s11, s13
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:11]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_readlane_b32 s31, v0, 1
-; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
@@ -19100,6 +19105,7 @@ define void @s_shuffle_v2i64_v8i64__11_4() {
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[16:31]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_mov_b32 s8, s22
; GFX900-NEXT: s_mov_b32 s9, s23
; GFX900-NEXT: s_mov_b32 s10, s12
@@ -19108,7 +19114,6 @@ define void @s_shuffle_v2i64_v8i64__11_4() {
; GFX900-NEXT: ; use s[8:11]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_readlane_b32 s31, v0, 1
-; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
@@ -19129,6 +19134,7 @@ define void @s_shuffle_v2i64_v8i64__11_4() {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[16:31]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_mov_b32 s8, s22
; GFX90A-NEXT: s_mov_b32 s9, s23
; GFX90A-NEXT: s_mov_b32 s10, s12
@@ -19137,7 +19143,6 @@ define void @s_shuffle_v2i64_v8i64__11_4() {
; GFX90A-NEXT: ; use s[8:11]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
@@ -19154,19 +19159,19 @@ define void @s_shuffle_v2i64_v8i64__11_4() {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[4:19]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_mov_b32 s10, s12
+; GFX942-NEXT: s_mov_b32 s11, s13
; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_mov_b32 s8, s22
+; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_mov_b32 s9, s23
-; GFX942-NEXT: s_mov_b32 s10, s12
-; GFX942-NEXT: s_mov_b32 s11, s13
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:11]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_readlane_b32 s31, v0, 1
-; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
@@ -19197,12 +19202,12 @@ define void @s_shuffle_v2i64_v8i64__12_4() {
; GFX900-NEXT: s_mov_b32 s26, s12
; GFX900-NEXT: s_mov_b32 s27, s13
; GFX900-NEXT: s_mov_b64 s[8:9], s[24:25]
+; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_mov_b64 s[10:11], s[26:27]
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:11]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_readlane_b32 s31, v0, 1
-; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
@@ -19226,12 +19231,12 @@ define void @s_shuffle_v2i64_v8i64__12_4() {
; GFX90A-NEXT: s_mov_b32 s26, s12
; GFX90A-NEXT: s_mov_b32 s27, s13
; GFX90A-NEXT: s_mov_b64 s[8:9], s[24:25]
+; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_mov_b64 s[10:11], s[26:27]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:11]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
@@ -19276,6 +19281,7 @@ define void @s_shuffle_v2i64_v8i64__13_4() {
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[16:31]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_mov_b32 s8, s26
; GFX900-NEXT: s_mov_b32 s9, s27
; GFX900-NEXT: s_mov_b32 s10, s12
@@ -19284,7 +19290,6 @@ define void @s_shuffle_v2i64_v8i64__13_4() {
; GFX900-NEXT: ; use s[8:11]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_readlane_b32 s31, v0, 1
-; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
@@ -19305,6 +19310,7 @@ define void @s_shuffle_v2i64_v8i64__13_4() {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[16:31]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_mov_b32 s8, s26
; GFX90A-NEXT: s_mov_b32 s9, s27
; GFX90A-NEXT: s_mov_b32 s10, s12
@@ -19313,7 +19319,6 @@ define void @s_shuffle_v2i64_v8i64__13_4() {
; GFX90A-NEXT: ; use s[8:11]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
@@ -19330,19 +19335,19 @@ define void @s_shuffle_v2i64_v8i64__13_4() {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[4:19]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_mov_b32 s10, s12
+; GFX942-NEXT: s_mov_b32 s11, s13
; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_mov_b32 s8, s26
+; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_mov_b32 s9, s27
-; GFX942-NEXT: s_mov_b32 s10, s12
-; GFX942-NEXT: s_mov_b32 s11, s13
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:11]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_readlane_b32 s31, v0, 1
-; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
@@ -19374,11 +19379,11 @@ define void @s_shuffle_v2i64_v8i64__14_4() {
; GFX900-NEXT: s_mov_b32 s31, s13
; GFX900-NEXT: s_mov_b64 s[8:9], s[28:29]
; GFX900-NEXT: s_mov_b64 s[10:11], s[30:31]
+; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:11]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_readlane_b32 s31, v0, 1
-; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
@@ -19403,11 +19408,11 @@ define void @s_shuffle_v2i64_v8i64__14_4() {
; GFX90A-NEXT: s_mov_b32 s31, s13
; GFX90A-NEXT: s_mov_b64 s[8:9], s[28:29]
; GFX90A-NEXT: s_mov_b64 s[10:11], s[30:31]
+; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:11]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
@@ -19874,12 +19879,12 @@ define void @s_shuffle_v2i64_v8i64__9_5() {
; GFX900-NEXT: s_mov_b32 s12, s18
; GFX900-NEXT: s_mov_b32 s13, s19
; GFX900-NEXT: s_mov_b64 s[8:9], s[12:13]
+; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_mov_b64 s[10:11], s[14:15]
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:11]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_readlane_b32 s31, v0, 1
-; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
@@ -19903,12 +19908,12 @@ define void @s_shuffle_v2i64_v8i64__9_5() {
; GFX90A-NEXT: s_mov_b32 s12, s18
; GFX90A-NEXT: s_mov_b32 s13, s19
; GFX90A-NEXT: s_mov_b64 s[8:9], s[12:13]
+; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_mov_b64 s[10:11], s[14:15]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:11]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
@@ -20012,12 +20017,12 @@ define void @s_shuffle_v2i64_v8i64__11_5() {
; GFX900-NEXT: s_mov_b32 s12, s22
; GFX900-NEXT: s_mov_b32 s13, s23
; GFX900-NEXT: s_mov_b64 s[8:9], s[12:13]
+; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_mov_b64 s[10:11], s[14:15]
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:11]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_readlane_b32 s31, v0, 1
-; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
@@ -20041,12 +20046,12 @@ define void @s_shuffle_v2i64_v8i64__11_5() {
; GFX90A-NEXT: s_mov_b32 s12, s22
; GFX90A-NEXT: s_mov_b32 s13, s23
; GFX90A-NEXT: s_mov_b64 s[8:9], s[12:13]
+; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_mov_b64 s[10:11], s[14:15]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:11]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
@@ -20094,12 +20099,12 @@ define void @s_shuffle_v2i64_v8i64__12_5() {
; GFX900-NEXT: s_mov_b32 s26, s14
; GFX900-NEXT: s_mov_b32 s27, s15
; GFX900-NEXT: s_mov_b64 s[8:9], s[24:25]
+; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_mov_b64 s[10:11], s[26:27]
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:11]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_readlane_b32 s31, v0, 1
-; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
@@ -20123,12 +20128,12 @@ define void @s_shuffle_v2i64_v8i64__12_5() {
; GFX90A-NEXT: s_mov_b32 s26, s14
; GFX90A-NEXT: s_mov_b32 s27, s15
; GFX90A-NEXT: s_mov_b64 s[8:9], s[24:25]
+; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_mov_b64 s[10:11], s[26:27]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:11]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
@@ -20176,12 +20181,12 @@ define void @s_shuffle_v2i64_v8i64__13_5() {
; GFX900-NEXT: s_mov_b32 s12, s26
; GFX900-NEXT: s_mov_b32 s13, s27
; GFX900-NEXT: s_mov_b64 s[8:9], s[12:13]
+; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_mov_b64 s[10:11], s[14:15]
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:11]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_readlane_b32 s31, v0, 1
-; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
@@ -20205,12 +20210,12 @@ define void @s_shuffle_v2i64_v8i64__13_5() {
; GFX90A-NEXT: s_mov_b32 s12, s26
; GFX90A-NEXT: s_mov_b32 s13, s27
; GFX90A-NEXT: s_mov_b64 s[8:9], s[12:13]
+; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_mov_b64 s[10:11], s[14:15]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:11]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
@@ -20259,11 +20264,11 @@ define void @s_shuffle_v2i64_v8i64__14_5() {
; GFX900-NEXT: s_mov_b32 s31, s15
; GFX900-NEXT: s_mov_b64 s[8:9], s[28:29]
; GFX900-NEXT: s_mov_b64 s[10:11], s[30:31]
+; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:11]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_readlane_b32 s31, v0, 1
-; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
@@ -20288,11 +20293,11 @@ define void @s_shuffle_v2i64_v8i64__14_5() {
; GFX90A-NEXT: s_mov_b32 s31, s15
; GFX90A-NEXT: s_mov_b64 s[8:9], s[28:29]
; GFX90A-NEXT: s_mov_b64 s[10:11], s[30:31]
+; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:11]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
@@ -20846,22 +20851,22 @@ define void @s_shuffle_v2i64_v8i64__9_6() {
; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
; GFX942-NEXT: v_writelane_b32 v0, s30, 0
-; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[0:15]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_mov_b32 s10, s12
+; GFX942-NEXT: s_mov_b32 s11, s13
+; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_mov_b32 s8, s18
+; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_mov_b32 s9, s19
-; GFX942-NEXT: s_mov_b32 s10, s12
-; GFX942-NEXT: s_mov_b32 s11, s13
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:11]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_readlane_b32 s31, v0, 1
-; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
@@ -21020,22 +21025,22 @@ define void @s_shuffle_v2i64_v8i64__11_6() {
; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
; GFX942-NEXT: v_writelane_b32 v0, s30, 0
-; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[0:15]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_mov_b32 s10, s12
+; GFX942-NEXT: s_mov_b32 s11, s13
+; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_mov_b32 s8, s22
+; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_mov_b32 s9, s23
-; GFX942-NEXT: s_mov_b32 s10, s12
-; GFX942-NEXT: s_mov_b32 s11, s13
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:11]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_readlane_b32 s31, v0, 1
-; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
@@ -21244,22 +21249,22 @@ define void @s_shuffle_v2i64_v8i64__13_6() {
; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
; GFX942-NEXT: v_writelane_b32 v0, s30, 0
-; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[0:15]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_mov_b32 s10, s12
+; GFX942-NEXT: s_mov_b32 s11, s13
+; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_mov_b32 s8, s26
+; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_mov_b32 s9, s27
-; GFX942-NEXT: s_mov_b32 s10, s12
-; GFX942-NEXT: s_mov_b32 s11, s13
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:11]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_readlane_b32 s31, v0, 1
-; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
@@ -21362,10 +21367,11 @@ define void @s_shuffle_v2i64_v8i64__14_6() {
; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
; GFX942-NEXT: v_writelane_b32 v0, s30, 0
-; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[0:15]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
; GFX942-NEXT: ;;#ASMEND
@@ -21373,11 +21379,11 @@ define void @s_shuffle_v2i64_v8i64__14_6() {
; GFX942-NEXT: s_mov_b32 s31, s13
; GFX942-NEXT: s_mov_b64 s[8:9], s[28:29]
; GFX942-NEXT: s_mov_b64 s[10:11], s[30:31]
+; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:11]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_readlane_b32 s31, v0, 1
-; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
@@ -21909,6 +21915,7 @@ define void @s_shuffle_v2i64_v8i64__9_7() {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[0:15]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
@@ -21916,12 +21923,12 @@ define void @s_shuffle_v2i64_v8i64__9_7() {
; GFX942-NEXT: s_mov_b32 s12, s18
; GFX942-NEXT: s_mov_b32 s13, s19
; GFX942-NEXT: s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_mov_b64 s[10:11], s[14:15]
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:11]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_readlane_b32 s31, v0, 1
-; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
@@ -22083,6 +22090,7 @@ define void @s_shuffle_v2i64_v8i64__11_7() {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[0:15]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
@@ -22090,12 +22098,12 @@ define void @s_shuffle_v2i64_v8i64__11_7() {
; GFX942-NEXT: s_mov_b32 s12, s22
; GFX942-NEXT: s_mov_b32 s13, s23
; GFX942-NEXT: s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_mov_b64 s[10:11], s[14:15]
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:11]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_readlane_b32 s31, v0, 1
-; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
@@ -22307,6 +22315,7 @@ define void @s_shuffle_v2i64_v8i64__13_7() {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[0:15]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
@@ -22314,12 +22323,12 @@ define void @s_shuffle_v2i64_v8i64__13_7() {
; GFX942-NEXT: s_mov_b32 s12, s26
; GFX942-NEXT: s_mov_b32 s13, s27
; GFX942-NEXT: s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_mov_b64 s[10:11], s[14:15]
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:11]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_readlane_b32 s31, v0, 1
-; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
@@ -22422,10 +22431,11 @@ define void @s_shuffle_v2i64_v8i64__14_7() {
; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
; GFX942-NEXT: v_writelane_b32 v0, s30, 0
-; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[0:15]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
; GFX942-NEXT: ;;#ASMEND
@@ -22433,11 +22443,11 @@ define void @s_shuffle_v2i64_v8i64__14_7() {
; GFX942-NEXT: s_mov_b32 s31, s15
; GFX942-NEXT: s_mov_b64 s[8:9], s[28:29]
; GFX942-NEXT: s_mov_b64 s[10:11], s[30:31]
+; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:11]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_readlane_b32 s31, v0, 1
-; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
@@ -23434,12 +23444,12 @@ define void @s_shuffle_v2i64_v8i64__4_9() {
; GFX900-NEXT: s_mov_b32 s14, s18
; GFX900-NEXT: s_mov_b32 s15, s19
; GFX900-NEXT: s_mov_b64 s[8:9], s[12:13]
+; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_mov_b64 s[10:11], s[14:15]
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:11]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_readlane_b32 s31, v0, 1
-; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
@@ -23463,12 +23473,12 @@ define void @s_shuffle_v2i64_v8i64__4_9() {
; GFX90A-NEXT: s_mov_b32 s14, s18
; GFX90A-NEXT: s_mov_b32 s15, s19
; GFX90A-NEXT: s_mov_b64 s[8:9], s[12:13]
+; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_mov_b64 s[10:11], s[14:15]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:11]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
@@ -23513,13 +23523,13 @@ define void @s_shuffle_v2i64_v8i64__5_9() {
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[8:23]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_mov_b32 s8, s26
; GFX900-NEXT: s_mov_b32 s9, s27
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:11]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_readlane_b32 s31, v0, 1
-; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
@@ -23540,13 +23550,13 @@ define void @s_shuffle_v2i64_v8i64__5_9() {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[8:23]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_mov_b32 s8, s26
; GFX90A-NEXT: s_mov_b32 s9, s27
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:11]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
@@ -23560,6 +23570,7 @@ define void @s_shuffle_v2i64_v8i64__5_9() {
; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
; GFX942-NEXT: v_writelane_b32 v0, s30, 0
+; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
@@ -23568,13 +23579,13 @@ define void @s_shuffle_v2i64_v8i64__5_9() {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[8:23]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_mov_b32 s8, s26
; GFX942-NEXT: s_mov_b32 s9, s27
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:11]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_readlane_b32 s31, v0, 1
-; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
@@ -23680,6 +23691,7 @@ define void @s_shuffle_v2i64_v8i64__6_9() {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[0:15]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
@@ -23687,12 +23699,12 @@ define void @s_shuffle_v2i64_v8i64__6_9() {
; GFX942-NEXT: s_mov_b32 s14, s18
; GFX942-NEXT: s_mov_b32 s15, s19
; GFX942-NEXT: s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_mov_b64 s[10:11], s[14:15]
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:11]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_readlane_b32 s31, v0, 1
-; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
@@ -24284,12 +24296,12 @@ define void @s_shuffle_v2i64_v8i64__4_10() {
; GFX900-NEXT: s_mov_b32 s14, s20
; GFX900-NEXT: s_mov_b32 s15, s21
; GFX900-NEXT: s_mov_b64 s[8:9], s[12:13]
+; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_mov_b64 s[10:11], s[14:15]
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:11]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_readlane_b32 s31, v0, 1
-; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
@@ -24313,12 +24325,12 @@ define void @s_shuffle_v2i64_v8i64__4_10() {
; GFX90A-NEXT: s_mov_b32 s14, s20
; GFX90A-NEXT: s_mov_b32 s15, s21
; GFX90A-NEXT: s_mov_b64 s[8:9], s[12:13]
+; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_mov_b64 s[10:11], s[14:15]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:11]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
@@ -24363,6 +24375,7 @@ define void @s_shuffle_v2i64_v8i64__5_10() {
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[16:31]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_mov_b32 s8, s14
; GFX900-NEXT: s_mov_b32 s9, s15
; GFX900-NEXT: s_mov_b32 s10, s20
@@ -24371,7 +24384,6 @@ define void @s_shuffle_v2i64_v8i64__5_10() {
; GFX900-NEXT: ; use s[8:11]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_readlane_b32 s31, v0, 1
-; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
@@ -24392,6 +24404,7 @@ define void @s_shuffle_v2i64_v8i64__5_10() {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[16:31]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_mov_b32 s8, s14
; GFX90A-NEXT: s_mov_b32 s9, s15
; GFX90A-NEXT: s_mov_b32 s10, s20
@@ -24400,7 +24413,6 @@ define void @s_shuffle_v2i64_v8i64__5_10() {
; GFX90A-NEXT: ; use s[8:11]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
@@ -24524,6 +24536,7 @@ define void @s_shuffle_v2i64_v8i64__6_10() {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[0:15]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
@@ -24531,12 +24544,12 @@ define void @s_shuffle_v2i64_v8i64__6_10() {
; GFX942-NEXT: s_mov_b32 s14, s20
; GFX942-NEXT: s_mov_b32 s15, s21
; GFX942-NEXT: s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_mov_b64 s[10:11], s[14:15]
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:11]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_readlane_b32 s31, v0, 1
-; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
@@ -24639,22 +24652,22 @@ define void @s_shuffle_v2i64_v8i64__7_10() {
; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
; GFX942-NEXT: v_writelane_b32 v0, s30, 0
-; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[0:15]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_mov_b32 s8, s14
+; GFX942-NEXT: s_mov_b32 s9, s15
+; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s14
-; GFX942-NEXT: s_mov_b32 s9, s15
; GFX942-NEXT: s_mov_b32 s10, s20
+; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_mov_b32 s11, s21
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:11]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_readlane_b32 s31, v0, 1
-; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
@@ -25235,13 +25248,13 @@ define void @s_shuffle_v2i64_v8i64__3_11() {
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[4:19]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_mov_b32 s8, s22
; GFX900-NEXT: s_mov_b32 s9, s23
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:11]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_readlane_b32 s31, v0, 1
-; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
@@ -25262,13 +25275,13 @@ define void @s_shuffle_v2i64_v8i64__3_11() {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[4:19]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_mov_b32 s8, s22
; GFX90A-NEXT: s_mov_b32 s9, s23
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:11]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
@@ -25282,6 +25295,7 @@ define void @s_shuffle_v2i64_v8i64__3_11() {
; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
; GFX942-NEXT: v_writelane_b32 v0, s30, 0
+; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
@@ -25290,13 +25304,13 @@ define void @s_shuffle_v2i64_v8i64__3_11() {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[4:19]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_mov_b32 s8, s22
; GFX942-NEXT: s_mov_b32 s9, s23
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:11]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_readlane_b32 s31, v0, 1
-; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
@@ -25327,12 +25341,12 @@ define void @s_shuffle_v2i64_v8i64__4_11() {
; GFX900-NEXT: s_mov_b32 s14, s22
; GFX900-NEXT: s_mov_b32 s15, s23
; GFX900-NEXT: s_mov_b64 s[8:9], s[12:13]
+; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_mov_b64 s[10:11], s[14:15]
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:11]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_readlane_b32 s31, v0, 1
-; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
@@ -25356,12 +25370,12 @@ define void @s_shuffle_v2i64_v8i64__4_11() {
; GFX90A-NEXT: s_mov_b32 s14, s22
; GFX90A-NEXT: s_mov_b32 s15, s23
; GFX90A-NEXT: s_mov_b64 s[8:9], s[12:13]
+; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_mov_b64 s[10:11], s[14:15]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:11]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
@@ -25540,6 +25554,7 @@ define void @s_shuffle_v2i64_v8i64__6_11() {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[0:15]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
@@ -25547,12 +25562,12 @@ define void @s_shuffle_v2i64_v8i64__6_11() {
; GFX942-NEXT: s_mov_b32 s14, s22
; GFX942-NEXT: s_mov_b32 s15, s23
; GFX942-NEXT: s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_mov_b64 s[10:11], s[14:15]
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:11]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_readlane_b32 s31, v0, 1
-; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
@@ -26144,12 +26159,12 @@ define void @s_shuffle_v2i64_v8i64__4_12() {
; GFX900-NEXT: s_mov_b32 s14, s24
; GFX900-NEXT: s_mov_b32 s15, s25
; GFX900-NEXT: s_mov_b64 s[8:9], s[12:13]
+; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_mov_b64 s[10:11], s[14:15]
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:11]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_readlane_b32 s31, v0, 1
-; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
@@ -26173,12 +26188,12 @@ define void @s_shuffle_v2i64_v8i64__4_12() {
; GFX90A-NEXT: s_mov_b32 s14, s24
; GFX90A-NEXT: s_mov_b32 s15, s25
; GFX90A-NEXT: s_mov_b64 s[8:9], s[12:13]
+; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_mov_b64 s[10:11], s[14:15]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:11]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
@@ -26223,6 +26238,7 @@ define void @s_shuffle_v2i64_v8i64__5_12() {
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[16:31]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_mov_b32 s8, s14
; GFX900-NEXT: s_mov_b32 s9, s15
; GFX900-NEXT: s_mov_b32 s10, s24
@@ -26231,7 +26247,6 @@ define void @s_shuffle_v2i64_v8i64__5_12() {
; GFX900-NEXT: ; use s[8:11]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_readlane_b32 s31, v0, 1
-; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
@@ -26252,6 +26267,7 @@ define void @s_shuffle_v2i64_v8i64__5_12() {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[16:31]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_mov_b32 s8, s14
; GFX90A-NEXT: s_mov_b32 s9, s15
; GFX90A-NEXT: s_mov_b32 s10, s24
@@ -26260,7 +26276,6 @@ define void @s_shuffle_v2i64_v8i64__5_12() {
; GFX90A-NEXT: ; use s[8:11]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
@@ -26384,6 +26399,7 @@ define void @s_shuffle_v2i64_v8i64__6_12() {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[0:15]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
@@ -26391,12 +26407,12 @@ define void @s_shuffle_v2i64_v8i64__6_12() {
; GFX942-NEXT: s_mov_b32 s14, s24
; GFX942-NEXT: s_mov_b32 s15, s25
; GFX942-NEXT: s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_mov_b64 s[10:11], s[14:15]
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:11]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_readlane_b32 s31, v0, 1
-; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
@@ -26499,22 +26515,22 @@ define void @s_shuffle_v2i64_v8i64__7_12() {
; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
; GFX942-NEXT: v_writelane_b32 v0, s30, 0
-; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[0:15]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_mov_b32 s8, s14
+; GFX942-NEXT: s_mov_b32 s9, s15
+; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s14
-; GFX942-NEXT: s_mov_b32 s9, s15
; GFX942-NEXT: s_mov_b32 s10, s24
+; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_mov_b32 s11, s25
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:11]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_readlane_b32 s31, v0, 1
-; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
@@ -26880,20 +26896,21 @@ define void @s_shuffle_v2i64_v8i64__1_13() {
; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
; GFX942-NEXT: v_writelane_b32 v0, s30, 0
-; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[16:31]
+; GFX942-NEXT: ; def s[0:15]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:15]
+; GFX942-NEXT: ; def s[16:31]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_mov_b32 s8, s18
+; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_mov_b32 s9, s19
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:11]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_readlane_b32 s31, v0, 1
-; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
@@ -27040,12 +27057,12 @@ define void @s_shuffle_v2i64_v8i64__4_13() {
; GFX900-NEXT: s_mov_b32 s14, s26
; GFX900-NEXT: s_mov_b32 s15, s27
; GFX900-NEXT: s_mov_b64 s[8:9], s[12:13]
+; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_mov_b64 s[10:11], s[14:15]
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:11]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_readlane_b32 s31, v0, 1
-; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
@@ -27069,12 +27086,12 @@ define void @s_shuffle_v2i64_v8i64__4_13() {
; GFX90A-NEXT: s_mov_b32 s14, s26
; GFX90A-NEXT: s_mov_b32 s15, s27
; GFX90A-NEXT: s_mov_b64 s[8:9], s[12:13]
+; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_mov_b64 s[10:11], s[14:15]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:11]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
@@ -27122,12 +27139,12 @@ define void @s_shuffle_v2i64_v8i64__5_13() {
; GFX900-NEXT: s_mov_b32 s24, s14
; GFX900-NEXT: s_mov_b32 s25, s15
; GFX900-NEXT: s_mov_b64 s[8:9], s[24:25]
+; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_mov_b64 s[10:11], s[26:27]
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:11]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_readlane_b32 s31, v0, 1
-; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
@@ -27151,12 +27168,12 @@ define void @s_shuffle_v2i64_v8i64__5_13() {
; GFX90A-NEXT: s_mov_b32 s24, s14
; GFX90A-NEXT: s_mov_b32 s25, s15
; GFX90A-NEXT: s_mov_b64 s[8:9], s[24:25]
+; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_mov_b64 s[10:11], s[26:27]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:11]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
@@ -27279,6 +27296,7 @@ define void @s_shuffle_v2i64_v8i64__6_13() {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[0:15]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
@@ -27286,12 +27304,12 @@ define void @s_shuffle_v2i64_v8i64__6_13() {
; GFX942-NEXT: s_mov_b32 s14, s26
; GFX942-NEXT: s_mov_b32 s15, s27
; GFX942-NEXT: s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_mov_b64 s[10:11], s[14:15]
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:11]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_readlane_b32 s31, v0, 1
-; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
@@ -27997,12 +28015,12 @@ define void @s_shuffle_v2i64_v8i64__4_14() {
; GFX900-NEXT: s_mov_b32 s14, s28
; GFX900-NEXT: s_mov_b32 s15, s29
; GFX900-NEXT: s_mov_b64 s[8:9], s[12:13]
+; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_mov_b64 s[10:11], s[14:15]
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:11]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_readlane_b32 s31, v0, 1
-; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
@@ -28026,12 +28044,12 @@ define void @s_shuffle_v2i64_v8i64__4_14() {
; GFX90A-NEXT: s_mov_b32 s14, s28
; GFX90A-NEXT: s_mov_b32 s15, s29
; GFX90A-NEXT: s_mov_b64 s[8:9], s[12:13]
+; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_mov_b64 s[10:11], s[14:15]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:11]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
@@ -28076,6 +28094,7 @@ define void @s_shuffle_v2i64_v8i64__5_14() {
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[16:31]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_mov_b32 s8, s14
; GFX900-NEXT: s_mov_b32 s9, s15
; GFX900-NEXT: s_mov_b32 s10, s28
@@ -28084,7 +28103,6 @@ define void @s_shuffle_v2i64_v8i64__5_14() {
; GFX900-NEXT: ; use s[8:11]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_readlane_b32 s31, v0, 1
-; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
@@ -28105,6 +28123,7 @@ define void @s_shuffle_v2i64_v8i64__5_14() {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[16:31]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_mov_b32 s8, s14
; GFX90A-NEXT: s_mov_b32 s9, s15
; GFX90A-NEXT: s_mov_b32 s10, s28
@@ -28113,7 +28132,6 @@ define void @s_shuffle_v2i64_v8i64__5_14() {
; GFX90A-NEXT: ; use s[8:11]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
@@ -28237,6 +28255,7 @@ define void @s_shuffle_v2i64_v8i64__6_14() {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[0:15]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
@@ -28244,12 +28263,12 @@ define void @s_shuffle_v2i64_v8i64__6_14() {
; GFX942-NEXT: s_mov_b32 s14, s28
; GFX942-NEXT: s_mov_b32 s15, s29
; GFX942-NEXT: s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_mov_b64 s[10:11], s[14:15]
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:11]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_readlane_b32 s31, v0, 1
-; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
@@ -28352,22 +28371,22 @@ define void @s_shuffle_v2i64_v8i64__7_14() {
; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
; GFX942-NEXT: v_writelane_b32 v0, s30, 0
-; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[0:15]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_mov_b32 s8, s14
+; GFX942-NEXT: s_mov_b32 s9, s15
+; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s14
-; GFX942-NEXT: s_mov_b32 s9, s15
; GFX942-NEXT: s_mov_b32 s10, s28
+; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_mov_b32 s11, s29
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:11]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_readlane_b32 s31, v0, 1
-; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
@@ -28978,12 +28997,12 @@ define void @s_shuffle_v2i64_v8i64__4_15() {
; GFX900-NEXT: s_mov_b32 s14, s30
; GFX900-NEXT: s_mov_b32 s15, s31
; GFX900-NEXT: s_mov_b64 s[8:9], s[12:13]
+; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_mov_b64 s[10:11], s[14:15]
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:11]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_readlane_b32 s31, v0, 1
-; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
@@ -29007,12 +29026,12 @@ define void @s_shuffle_v2i64_v8i64__4_15() {
; GFX90A-NEXT: s_mov_b32 s14, s30
; GFX90A-NEXT: s_mov_b32 s15, s31
; GFX90A-NEXT: s_mov_b64 s[8:9], s[12:13]
+; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_mov_b64 s[10:11], s[14:15]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:11]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
@@ -29061,11 +29080,11 @@ define void @s_shuffle_v2i64_v8i64__5_15() {
; GFX900-NEXT: s_mov_b32 s29, s15
; GFX900-NEXT: s_mov_b64 s[8:9], s[28:29]
; GFX900-NEXT: s_mov_b64 s[10:11], s[30:31]
+; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:11]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_readlane_b32 s31, v0, 1
-; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
@@ -29090,11 +29109,11 @@ define void @s_shuffle_v2i64_v8i64__5_15() {
; GFX90A-NEXT: s_mov_b32 s29, s15
; GFX90A-NEXT: s_mov_b64 s[8:9], s[28:29]
; GFX90A-NEXT: s_mov_b64 s[10:11], s[30:31]
+; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:11]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
@@ -29219,6 +29238,7 @@ define void @s_shuffle_v2i64_v8i64__6_15() {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[0:15]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
@@ -29226,12 +29246,12 @@ define void @s_shuffle_v2i64_v8i64__6_15() {
; GFX942-NEXT: s_mov_b32 s14, s30
; GFX942-NEXT: s_mov_b32 s15, s31
; GFX942-NEXT: s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_mov_b64 s[10:11], s[14:15]
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:11]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_readlane_b32 s31, v0, 1
-; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
@@ -29334,22 +29354,23 @@ define void @s_shuffle_v2i64_v8i64__7_15() {
; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
; GFX942-NEXT: v_writelane_b32 v0, s30, 0
-; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[16:31]
+; GFX942-NEXT: ; def s[0:15]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:15]
+; GFX942-NEXT: ; def s[16:31]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_mov_b32 s28, s14
; GFX942-NEXT: s_mov_b32 s29, s15
; GFX942-NEXT: s_mov_b64 s[8:9], s[28:29]
; GFX942-NEXT: s_mov_b64 s[10:11], s[30:31]
+; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:11]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_readlane_b32 s31, v0, 1
-; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-vgpr-lanes-usage.mir b/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-vgpr-lanes-usage.mir
index 1ffef8e60d90d..ea67593d72761 100644
--- a/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-vgpr-lanes-usage.mir
+++ b/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-vgpr-lanes-usage.mir
@@ -24,10 +24,10 @@ machineFunctionInfo:
body: |
bb.0:
; SGPR_SPILLED-LABEL: name: stack-slot-share-equal-sized-spills
- ; SGPR_SPILLED: liveins: $sgpr30, $sgpr31, $vgpr62
+ ; SGPR_SPILLED: liveins: $vgpr62, $sgpr30_sgpr31
; SGPR_SPILLED-NEXT: {{ $}}
- ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr30, 0, $vgpr62
- ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr62
+ ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR $sgpr30, 0, $vgpr62, implicit-def $sgpr30_sgpr31, implicit $sgpr30_sgpr31
+ ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr62, implicit killed $sgpr30_sgpr31
; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr32, 0, [[DEF]]
; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr0, 1, [[DEF]], implicit-def $sgpr0_sgpr1, implicit $sgpr0_sgpr1
@@ -89,10 +89,10 @@ machineFunctionInfo:
body: |
bb.0:
; SGPR_SPILLED-LABEL: name: stack-slot-share-unequal-sized-spills-with-large-spill-first
- ; SGPR_SPILLED: liveins: $sgpr30, $sgpr31, $vgpr62
+ ; SGPR_SPILLED: liveins: $vgpr62, $sgpr30_sgpr31
; SGPR_SPILLED-NEXT: {{ $}}
- ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr30, 0, $vgpr62
- ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr62
+ ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR $sgpr30, 0, $vgpr62, implicit-def $sgpr30_sgpr31, implicit $sgpr30_sgpr31
+ ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr62, implicit killed $sgpr30_sgpr31
; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr32, 0, [[DEF]]
; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr2, 1, [[DEF]], implicit-def $sgpr2_sgpr3, implicit $sgpr2_sgpr3
@@ -152,10 +152,10 @@ machineFunctionInfo:
body: |
bb.0:
; SGPR_SPILLED-LABEL: name: stack-slot-share-unequal-sized-spills-with-small-spill-first
- ; SGPR_SPILLED: liveins: $sgpr30, $sgpr31, $vgpr62
+ ; SGPR_SPILLED: liveins: $vgpr62, $sgpr30_sgpr31
; SGPR_SPILLED-NEXT: {{ $}}
- ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr30, 0, $vgpr62
- ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr62
+ ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR $sgpr30, 0, $vgpr62, implicit-def $sgpr30_sgpr31, implicit $sgpr30_sgpr31
+ ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr62, implicit killed $sgpr30_sgpr31
; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr32, 0, [[DEF]]
; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr0, 1, [[DEF]]
diff --git a/llvm/test/CodeGen/AMDGPU/sibling-call.ll b/llvm/test/CodeGen/AMDGPU/sibling-call.ll
index 00214ef36e1f0..d4b53e9d338f8 100644
--- a/llvm/test/CodeGen/AMDGPU/sibling-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/sibling-call.ll
@@ -242,8 +242,8 @@ define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32_byval_parent(i32 %a, pt
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_readlane_b32 s30, v40, 0
+; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: v_readlane_b32 s4, v40, 2
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -425,8 +425,8 @@ define fastcc i32 @no_sibling_call_callee_more_stack_space(i32 %a, i32 %b) #1 {
; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_readlane_b32 s30, v40, 0
+; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: v_readlane_b32 s4, v40, 2
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -469,11 +469,11 @@ define fastcc i32 @sibling_call_i32_fastcc_i32_i32_other_call(i32 %a, i32 %b, i3
; GCN-NEXT: v_mov_b32_e32 v1, v40
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT: v_readlane_b32 s30, v42, 0
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, sibling_call_i32_fastcc_i32_i32@rel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, sibling_call_i32_fastcc_i32_i32@rel32@hi+12
; GCN-NEXT: v_readlane_b32 s31, v42, 1
-; GCN-NEXT: v_readlane_b32 s30, v42, 0
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: v_readlane_b32 s6, v42, 2
; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1
@@ -603,23 +603,23 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr
; FIJI-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; FIJI-NEXT: s_mov_b64 exec, s[18:19]
; FIJI-NEXT: v_writelane_b32 v40, s16, 18
-; FIJI-NEXT: v_writelane_b32 v40, s30, 0
-; FIJI-NEXT: v_writelane_b32 v40, s31, 1
-; FIJI-NEXT: v_writelane_b32 v40, s34, 2
-; FIJI-NEXT: v_writelane_b32 v40, s35, 3
-; FIJI-NEXT: v_writelane_b32 v40, s36, 4
-; FIJI-NEXT: v_writelane_b32 v40, s37, 5
-; FIJI-NEXT: v_writelane_b32 v40, s38, 6
-; FIJI-NEXT: v_writelane_b32 v40, s39, 7
-; FIJI-NEXT: v_writelane_b32 v40, s48, 8
-; FIJI-NEXT: v_writelane_b32 v40, s49, 9
-; FIJI-NEXT: v_writelane_b32 v40, s50, 10
-; FIJI-NEXT: v_writelane_b32 v40, s51, 11
-; FIJI-NEXT: v_writelane_b32 v40, s52, 12
-; FIJI-NEXT: v_writelane_b32 v40, s53, 13
-; FIJI-NEXT: v_writelane_b32 v40, s54, 14
-; FIJI-NEXT: v_writelane_b32 v40, s55, 15
-; FIJI-NEXT: v_writelane_b32 v40, s64, 16
+; FIJI-NEXT: v_writelane_b32 v40, s34, 0
+; FIJI-NEXT: v_writelane_b32 v40, s35, 1
+; FIJI-NEXT: v_writelane_b32 v40, s36, 2
+; FIJI-NEXT: v_writelane_b32 v40, s37, 3
+; FIJI-NEXT: v_writelane_b32 v40, s38, 4
+; FIJI-NEXT: v_writelane_b32 v40, s39, 5
+; FIJI-NEXT: v_writelane_b32 v40, s48, 6
+; FIJI-NEXT: v_writelane_b32 v40, s49, 7
+; FIJI-NEXT: v_writelane_b32 v40, s50, 8
+; FIJI-NEXT: v_writelane_b32 v40, s51, 9
+; FIJI-NEXT: v_writelane_b32 v40, s52, 10
+; FIJI-NEXT: v_writelane_b32 v40, s53, 11
+; FIJI-NEXT: v_writelane_b32 v40, s54, 12
+; FIJI-NEXT: v_writelane_b32 v40, s55, 13
+; FIJI-NEXT: v_writelane_b32 v40, s64, 14
+; FIJI-NEXT: v_writelane_b32 v40, s65, 15
+; FIJI-NEXT: v_writelane_b32 v40, s30, 16
; FIJI-NEXT: s_mov_b32 s50, s15
; FIJI-NEXT: s_mov_b32 s51, s14
; FIJI-NEXT: s_mov_b32 s52, s13
@@ -631,7 +631,7 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr
; FIJI-NEXT: v_add_u32_e32 v3, vcc, v3, v4
; FIJI-NEXT: s_mov_b64 s[54:55], exec
; FIJI-NEXT: s_addk_i32 s32, 0x400
-; FIJI-NEXT: v_writelane_b32 v40, s65, 17
+; FIJI-NEXT: v_writelane_b32 v40, s31, 17
; FIJI-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
; FIJI-NEXT: v_readfirstlane_b32 s16, v0
; FIJI-NEXT: v_readfirstlane_b32 s17, v1
@@ -657,25 +657,25 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr
; FIJI-NEXT: s_cbranch_execnz .LBB18_1
; FIJI-NEXT: ; %bb.2:
; FIJI-NEXT: s_mov_b64 exec, s[54:55]
+; FIJI-NEXT: v_readlane_b32 s30, v40, 16
; FIJI-NEXT: v_mov_b32_e32 v0, v4
-; FIJI-NEXT: v_readlane_b32 s65, v40, 17
-; FIJI-NEXT: v_readlane_b32 s64, v40, 16
-; FIJI-NEXT: v_readlane_b32 s55, v40, 15
-; FIJI-NEXT: v_readlane_b32 s54, v40, 14
-; FIJI-NEXT: v_readlane_b32 s53, v40, 13
-; FIJI-NEXT: v_readlane_b32 s52, v40, 12
-; FIJI-NEXT: v_readlane_b32 s51, v40, 11
-; FIJI-NEXT: v_readlane_b32 s50, v40, 10
-; FIJI-NEXT: v_readlane_b32 s49, v40, 9
-; FIJI-NEXT: v_readlane_b32 s48, v40, 8
-; FIJI-NEXT: v_readlane_b32 s39, v40, 7
-; FIJI-NEXT: v_readlane_b32 s38, v40, 6
-; FIJI-NEXT: v_readlane_b32 s37, v40, 5
-; FIJI-NEXT: v_readlane_b32 s36, v40, 4
-; FIJI-NEXT: v_readlane_b32 s35, v40, 3
-; FIJI-NEXT: v_readlane_b32 s34, v40, 2
-; FIJI-NEXT: v_readlane_b32 s31, v40, 1
-; FIJI-NEXT: v_readlane_b32 s30, v40, 0
+; FIJI-NEXT: v_readlane_b32 s31, v40, 17
+; FIJI-NEXT: v_readlane_b32 s65, v40, 15
+; FIJI-NEXT: v_readlane_b32 s64, v40, 14
+; FIJI-NEXT: v_readlane_b32 s55, v40, 13
+; FIJI-NEXT: v_readlane_b32 s54, v40, 12
+; FIJI-NEXT: v_readlane_b32 s53, v40, 11
+; FIJI-NEXT: v_readlane_b32 s52, v40, 10
+; FIJI-NEXT: v_readlane_b32 s51, v40, 9
+; FIJI-NEXT: v_readlane_b32 s50, v40, 8
+; FIJI-NEXT: v_readlane_b32 s49, v40, 7
+; FIJI-NEXT: v_readlane_b32 s48, v40, 6
+; FIJI-NEXT: v_readlane_b32 s39, v40, 5
+; FIJI-NEXT: v_readlane_b32 s38, v40, 4
+; FIJI-NEXT: v_readlane_b32 s37, v40, 3
+; FIJI-NEXT: v_readlane_b32 s36, v40, 2
+; FIJI-NEXT: v_readlane_b32 s35, v40, 1
+; FIJI-NEXT: v_readlane_b32 s34, v40, 0
; FIJI-NEXT: s_mov_b32 s32, s33
; FIJI-NEXT: v_readlane_b32 s4, v40, 18
; FIJI-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -694,23 +694,23 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr
; HAWAII-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; HAWAII-NEXT: s_mov_b64 exec, s[18:19]
; HAWAII-NEXT: v_writelane_b32 v40, s16, 18
-; HAWAII-NEXT: v_writelane_b32 v40, s30, 0
-; HAWAII-NEXT: v_writelane_b32 v40, s31, 1
-; HAWAII-NEXT: v_writelane_b32 v40, s34, 2
-; HAWAII-NEXT: v_writelane_b32 v40, s35, 3
-; HAWAII-NEXT: v_writelane_b32 v40, s36, 4
-; HAWAII-NEXT: v_writelane_b32 v40, s37, 5
-; HAWAII-NEXT: v_writelane_b32 v40, s38, 6
-; HAWAII-NEXT: v_writelane_b32 v40, s39, 7
-; HAWAII-NEXT: v_writelane_b32 v40, s48, 8
-; HAWAII-NEXT: v_writelane_b32 v40, s49, 9
-; HAWAII-NEXT: v_writelane_b32 v40, s50, 10
-; HAWAII-NEXT: v_writelane_b32 v40, s51, 11
-; HAWAII-NEXT: v_writelane_b32 v40, s52, 12
-; HAWAII-NEXT: v_writelane_b32 v40, s53, 13
-; HAWAII-NEXT: v_writelane_b32 v40, s54, 14
-; HAWAII-NEXT: v_writelane_b32 v40, s55, 15
-; HAWAII-NEXT: v_writelane_b32 v40, s64, 16
+; HAWAII-NEXT: v_writelane_b32 v40, s34, 0
+; HAWAII-NEXT: v_writelane_b32 v40, s35, 1
+; HAWAII-NEXT: v_writelane_b32 v40, s36, 2
+; HAWAII-NEXT: v_writelane_b32 v40, s37, 3
+; HAWAII-NEXT: v_writelane_b32 v40, s38, 4
+; HAWAII-NEXT: v_writelane_b32 v40, s39, 5
+; HAWAII-NEXT: v_writelane_b32 v40, s48, 6
+; HAWAII-NEXT: v_writelane_b32 v40, s49, 7
+; HAWAII-NEXT: v_writelane_b32 v40, s50, 8
+; HAWAII-NEXT: v_writelane_b32 v40, s51, 9
+; HAWAII-NEXT: v_writelane_b32 v40, s52, 10
+; HAWAII-NEXT: v_writelane_b32 v40, s53, 11
+; HAWAII-NEXT: v_writelane_b32 v40, s54, 12
+; HAWAII-NEXT: v_writelane_b32 v40, s55, 13
+; HAWAII-NEXT: v_writelane_b32 v40, s64, 14
+; HAWAII-NEXT: v_writelane_b32 v40, s65, 15
+; HAWAII-NEXT: v_writelane_b32 v40, s30, 16
; HAWAII-NEXT: s_mov_b32 s50, s15
; HAWAII-NEXT: s_mov_b32 s51, s14
; HAWAII-NEXT: s_mov_b32 s52, s13
@@ -722,7 +722,7 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr
; HAWAII-NEXT: v_add_i32_e32 v3, vcc, v3, v4
; HAWAII-NEXT: s_mov_b64 s[54:55], exec
; HAWAII-NEXT: s_addk_i32 s32, 0x400
-; HAWAII-NEXT: v_writelane_b32 v40, s65, 17
+; HAWAII-NEXT: v_writelane_b32 v40, s31, 17
; HAWAII-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
; HAWAII-NEXT: v_readfirstlane_b32 s16, v0
; HAWAII-NEXT: v_readfirstlane_b32 s17, v1
@@ -748,25 +748,25 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr
; HAWAII-NEXT: s_cbranch_execnz .LBB18_1
; HAWAII-NEXT: ; %bb.2:
; HAWAII-NEXT: s_mov_b64 exec, s[54:55]
+; HAWAII-NEXT: v_readlane_b32 s30, v40, 16
; HAWAII-NEXT: v_mov_b32_e32 v0, v4
-; HAWAII-NEXT: v_readlane_b32 s65, v40, 17
-; HAWAII-NEXT: v_readlane_b32 s64, v40, 16
-; HAWAII-NEXT: v_readlane_b32 s55, v40, 15
-; HAWAII-NEXT: v_readlane_b32 s54, v40, 14
-; HAWAII-NEXT: v_readlane_b32 s53, v40, 13
-; HAWAII-NEXT: v_readlane_b32 s52, v40, 12
-; HAWAII-NEXT: v_readlane_b32 s51, v40, 11
-; HAWAII-NEXT: v_readlane_b32 s50, v40, 10
-; HAWAII-NEXT: v_readlane_b32 s49, v40, 9
-; HAWAII-NEXT: v_readlane_b32 s48, v40, 8
-; HAWAII-NEXT: v_readlane_b32 s39, v40, 7
-; HAWAII-NEXT: v_readlane_b32 s38, v40, 6
-; HAWAII-NEXT: v_readlane_b32 s37, v40, 5
-; HAWAII-NEXT: v_readlane_b32 s36, v40, 4
-; HAWAII-NEXT: v_readlane_b32 s35, v40, 3
-; HAWAII-NEXT: v_readlane_b32 s34, v40, 2
-; HAWAII-NEXT: v_readlane_b32 s31, v40, 1
-; HAWAII-NEXT: v_readlane_b32 s30, v40, 0
+; HAWAII-NEXT: v_readlane_b32 s31, v40, 17
+; HAWAII-NEXT: v_readlane_b32 s65, v40, 15
+; HAWAII-NEXT: v_readlane_b32 s64, v40, 14
+; HAWAII-NEXT: v_readlane_b32 s55, v40, 13
+; HAWAII-NEXT: v_readlane_b32 s54, v40, 12
+; HAWAII-NEXT: v_readlane_b32 s53, v40, 11
+; HAWAII-NEXT: v_readlane_b32 s52, v40, 10
+; HAWAII-NEXT: v_readlane_b32 s51, v40, 9
+; HAWAII-NEXT: v_readlane_b32 s50, v40, 8
+; HAWAII-NEXT: v_readlane_b32 s49, v40, 7
+; HAWAII-NEXT: v_readlane_b32 s48, v40, 6
+; HAWAII-NEXT: v_readlane_b32 s39, v40, 5
+; HAWAII-NEXT: v_readlane_b32 s38, v40, 4
+; HAWAII-NEXT: v_readlane_b32 s37, v40, 3
+; HAWAII-NEXT: v_readlane_b32 s36, v40, 2
+; HAWAII-NEXT: v_readlane_b32 s35, v40, 1
+; HAWAII-NEXT: v_readlane_b32 s34, v40, 0
; HAWAII-NEXT: s_mov_b32 s32, s33
; HAWAII-NEXT: v_readlane_b32 s4, v40, 18
; HAWAII-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -785,23 +785,23 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[18:19]
; GFX9-NEXT: v_writelane_b32 v40, s16, 18
-; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
-; GFX9-NEXT: v_writelane_b32 v40, s34, 2
-; GFX9-NEXT: v_writelane_b32 v40, s35, 3
-; GFX9-NEXT: v_writelane_b32 v40, s36, 4
-; GFX9-NEXT: v_writelane_b32 v40, s37, 5
-; GFX9-NEXT: v_writelane_b32 v40, s38, 6
-; GFX9-NEXT: v_writelane_b32 v40, s39, 7
-; GFX9-NEXT: v_writelane_b32 v40, s48, 8
-; GFX9-NEXT: v_writelane_b32 v40, s49, 9
-; GFX9-NEXT: v_writelane_b32 v40, s50, 10
-; GFX9-NEXT: v_writelane_b32 v40, s51, 11
-; GFX9-NEXT: v_writelane_b32 v40, s52, 12
-; GFX9-NEXT: v_writelane_b32 v40, s53, 13
-; GFX9-NEXT: v_writelane_b32 v40, s54, 14
-; GFX9-NEXT: v_writelane_b32 v40, s55, 15
-; GFX9-NEXT: v_writelane_b32 v40, s64, 16
+; GFX9-NEXT: v_writelane_b32 v40, s34, 0
+; GFX9-NEXT: v_writelane_b32 v40, s35, 1
+; GFX9-NEXT: v_writelane_b32 v40, s36, 2
+; GFX9-NEXT: v_writelane_b32 v40, s37, 3
+; GFX9-NEXT: v_writelane_b32 v40, s38, 4
+; GFX9-NEXT: v_writelane_b32 v40, s39, 5
+; GFX9-NEXT: v_writelane_b32 v40, s48, 6
+; GFX9-NEXT: v_writelane_b32 v40, s49, 7
+; GFX9-NEXT: v_writelane_b32 v40, s50, 8
+; GFX9-NEXT: v_writelane_b32 v40, s51, 9
+; GFX9-NEXT: v_writelane_b32 v40, s52, 10
+; GFX9-NEXT: v_writelane_b32 v40, s53, 11
+; GFX9-NEXT: v_writelane_b32 v40, s54, 12
+; GFX9-NEXT: v_writelane_b32 v40, s55, 13
+; GFX9-NEXT: v_writelane_b32 v40, s64, 14
+; GFX9-NEXT: v_writelane_b32 v40, s65, 15
+; GFX9-NEXT: v_writelane_b32 v40, s30, 16
; GFX9-NEXT: s_mov_b32 s50, s15
; GFX9-NEXT: s_mov_b32 s51, s14
; GFX9-NEXT: s_mov_b32 s52, s13
@@ -813,7 +813,7 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr
; GFX9-NEXT: v_add_u32_e32 v3, v3, v4
; GFX9-NEXT: s_mov_b64 s[54:55], exec
; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s65, 17
+; GFX9-NEXT: v_writelane_b32 v40, s31, 17
; GFX9-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_readfirstlane_b32 s16, v0
; GFX9-NEXT: v_readfirstlane_b32 s17, v1
@@ -839,25 +839,25 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr
; GFX9-NEXT: s_cbranch_execnz .LBB18_1
; GFX9-NEXT: ; %bb.2:
; GFX9-NEXT: s_mov_b64 exec, s[54:55]
+; GFX9-NEXT: v_readlane_b32 s30, v40, 16
; GFX9-NEXT: v_mov_b32_e32 v0, v4
-; GFX9-NEXT: v_readlane_b32 s65, v40, 17
-; GFX9-NEXT: v_readlane_b32 s64, v40, 16
-; GFX9-NEXT: v_readlane_b32 s55, v40, 15
-; GFX9-NEXT: v_readlane_b32 s54, v40, 14
-; GFX9-NEXT: v_readlane_b32 s53, v40, 13
-; GFX9-NEXT: v_readlane_b32 s52, v40, 12
-; GFX9-NEXT: v_readlane_b32 s51, v40, 11
-; GFX9-NEXT: v_readlane_b32 s50, v40, 10
-; GFX9-NEXT: v_readlane_b32 s49, v40, 9
-; GFX9-NEXT: v_readlane_b32 s48, v40, 8
-; GFX9-NEXT: v_readlane_b32 s39, v40, 7
-; GFX9-NEXT: v_readlane_b32 s38, v40, 6
-; GFX9-NEXT: v_readlane_b32 s37, v40, 5
-; GFX9-NEXT: v_readlane_b32 s36, v40, 4
-; GFX9-NEXT: v_readlane_b32 s35, v40, 3
-; GFX9-NEXT: v_readlane_b32 s34, v40, 2
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
-; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 17
+; GFX9-NEXT: v_readlane_b32 s65, v40, 15
+; GFX9-NEXT: v_readlane_b32 s64, v40, 14
+; GFX9-NEXT: v_readlane_b32 s55, v40, 13
+; GFX9-NEXT: v_readlane_b32 s54, v40, 12
+; GFX9-NEXT: v_readlane_b32 s53, v40, 11
+; GFX9-NEXT: v_readlane_b32 s52, v40, 10
+; GFX9-NEXT: v_readlane_b32 s51, v40, 9
+; GFX9-NEXT: v_readlane_b32 s50, v40, 8
+; GFX9-NEXT: v_readlane_b32 s49, v40, 7
+; GFX9-NEXT: v_readlane_b32 s48, v40, 6
+; GFX9-NEXT: v_readlane_b32 s39, v40, 5
+; GFX9-NEXT: v_readlane_b32 s38, v40, 4
+; GFX9-NEXT: v_readlane_b32 s37, v40, 3
+; GFX9-NEXT: v_readlane_b32 s36, v40, 2
+; GFX9-NEXT: v_readlane_b32 s35, v40, 1
+; GFX9-NEXT: v_readlane_b32 s34, v40, 0
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s4, v40, 18
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign.ll b/llvm/test/CodeGen/AMDGPU/stack-realign.ll
index 52d9a30ffb8ba..1c9ee01807a5f 100644
--- a/llvm/test/CodeGen/AMDGPU/stack-realign.ll
+++ b/llvm/test/CodeGen/AMDGPU/stack-realign.ll
@@ -311,8 +311,8 @@ define void @func_call_align1024_bp_gets_vgpr_spill(<32 x i32> %a, i32 %b) #0 {
; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_readlane_b32 s30, v40, 0
+; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: s_mov_b32 s32, s34
; GCN-NEXT: v_readlane_b32 s4, v40, 2
; GCN-NEXT: v_readlane_b32 s34, v40, 3
diff --git a/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll b/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll
index d2394bab82c77..06f1c08d1f04e 100644
--- a/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll
+++ b/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll
@@ -1283,11 +1283,11 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() {
; WAVE32-OPT-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4
; WAVE32-OPT-NEXT: s_swappc_b64 s[30:31], s[16:17]
; WAVE32-OPT-NEXT: s_mov_b32 s32, s18
+; WAVE32-OPT-NEXT: v_readlane_b32 s30, v32, 0
; WAVE32-OPT-NEXT: ;;#ASMSTART
; WAVE32-OPT-NEXT: ; use s19
; WAVE32-OPT-NEXT: ;;#ASMEND
; WAVE32-OPT-NEXT: v_readlane_b32 s31, v32, 1
-; WAVE32-OPT-NEXT: v_readlane_b32 s30, v32, 0
; WAVE32-OPT-NEXT: s_mov_b32 s32, s33
; WAVE32-OPT-NEXT: s_xor_saveexec_b32 s4, -1
; WAVE32-OPT-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload
@@ -1318,11 +1318,11 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() {
; WAVE64-OPT-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4
; WAVE64-OPT-NEXT: s_swappc_b64 s[30:31], s[16:17]
; WAVE64-OPT-NEXT: s_mov_b32 s32, s18
+; WAVE64-OPT-NEXT: v_readlane_b32 s30, v32, 0
; WAVE64-OPT-NEXT: ;;#ASMSTART
; WAVE64-OPT-NEXT: ; use s19
; WAVE64-OPT-NEXT: ;;#ASMEND
; WAVE64-OPT-NEXT: v_readlane_b32 s31, v32, 1
-; WAVE64-OPT-NEXT: v_readlane_b32 s30, v32, 0
; WAVE64-OPT-NEXT: s_mov_b32 s32, s33
; WAVE64-OPT-NEXT: s_xor_saveexec_b64 s[4:5], -1
; WAVE64-OPT-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload
@@ -1431,8 +1431,8 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() {
; WAVE32-O0-NEXT: ; use s5
; WAVE32-O0-NEXT: ;;#ASMEND
; WAVE32-O0-NEXT: s_mov_b32 s32, s4
-; WAVE32-O0-NEXT: v_readlane_b32 s31, v32, 1
; WAVE32-O0-NEXT: v_readlane_b32 s30, v32, 0
+; WAVE32-O0-NEXT: v_readlane_b32 s31, v32, 1
; WAVE32-O0-NEXT: s_mov_b32 s32, s33
; WAVE32-O0-NEXT: s_xor_saveexec_b32 s4, -1
; WAVE32-O0-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload
@@ -1542,8 +1542,8 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() {
; WAVE64-O0-NEXT: ; use s5
; WAVE64-O0-NEXT: ;;#ASMEND
; WAVE64-O0-NEXT: s_mov_b32 s32, s4
-; WAVE64-O0-NEXT: v_readlane_b32 s31, v32, 1
; WAVE64-O0-NEXT: v_readlane_b32 s30, v32, 0
+; WAVE64-O0-NEXT: v_readlane_b32 s31, v32, 1
; WAVE64-O0-NEXT: s_mov_b32 s32, s33
; WAVE64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1
; WAVE64-O0-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload
@@ -1653,8 +1653,8 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() {
; WAVE32-WWM-PREALLOC-NEXT: ; use s5
; WAVE32-WWM-PREALLOC-NEXT: ;;#ASMEND
; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s32, s4
-; WAVE32-WWM-PREALLOC-NEXT: v_readlane_b32 s31, v33, 1
; WAVE32-WWM-PREALLOC-NEXT: v_readlane_b32 s30, v33, 0
+; WAVE32-WWM-PREALLOC-NEXT: v_readlane_b32 s31, v33, 1
; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s32, s33
; WAVE32-WWM-PREALLOC-NEXT: s_xor_saveexec_b32 s4, -1
; WAVE32-WWM-PREALLOC-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll b/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll
index 5c83491e7289e..4232a09d9fc3a 100644
--- a/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll
+++ b/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll
@@ -194,8 +194,8 @@ define void @outgoing_f16_arg(ptr %ptr) #0 {
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX7-NEXT: v_readlane_b32 s31, v40, 1
; GFX7-NEXT: v_readlane_b32 s30, v40, 0
+; GFX7-NEXT: v_readlane_b32 s31, v40, 1
; GFX7-NEXT: s_mov_b32 s32, s33
; GFX7-NEXT: v_readlane_b32 s4, v40, 2
; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -230,8 +230,8 @@ define void @outgoing_v2f16_arg(ptr %ptr) #0 {
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX7-NEXT: v_readlane_b32 s31, v40, 1
; GFX7-NEXT: v_readlane_b32 s30, v40, 0
+; GFX7-NEXT: v_readlane_b32 s31, v40, 1
; GFX7-NEXT: s_mov_b32 s32, s33
; GFX7-NEXT: v_readlane_b32 s4, v40, 2
; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -266,8 +266,8 @@ define void @outgoing_f16_return(ptr %ptr) #0 {
; GFX7-NEXT: v_mov_b32_e32 v40, v0
; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_readlane_b32 s31, v42, 1
; GFX7-NEXT: v_readlane_b32 s30, v42, 0
+; GFX7-NEXT: v_readlane_b32 s31, v42, 1
; GFX7-NEXT: s_mov_b32 s32, s33
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
@@ -309,8 +309,8 @@ define void @outgoing_v2f16_return(ptr %ptr) #0 {
; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_readlane_b32 s31, v42, 1
; GFX7-NEXT: v_readlane_b32 s30, v42, 0
+; GFX7-NEXT: v_readlane_b32 s31, v42, 1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
@@ -381,8 +381,8 @@ define void @outgoing_v4f16_return(ptr %ptr) #0 {
; GFX7-NEXT: flat_store_dword v[40:41], v4
; GFX7-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX7-NEXT: v_readlane_b32 s31, v42, 1
; GFX7-NEXT: v_readlane_b32 s30, v42, 0
+; GFX7-NEXT: v_readlane_b32 s31, v42, 1
; GFX7-NEXT: s_mov_b32 s32, s33
; GFX7-NEXT: v_readlane_b32 s4, v42, 2
; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -468,8 +468,8 @@ define void @outgoing_v8f16_return(ptr %ptr) #0 {
; GFX7-NEXT: flat_store_dword v[40:41], v8
; GFX7-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX7-NEXT: v_readlane_b32 s31, v42, 1
; GFX7-NEXT: v_readlane_b32 s30, v42, 0
+; GFX7-NEXT: v_readlane_b32 s31, v42, 1
; GFX7-NEXT: s_mov_b32 s32, s33
; GFX7-NEXT: v_readlane_b32 s4, v42, 2
; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -518,6 +518,7 @@ define half @call_split_type_used_outside_block_v8f16() #0 {
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT: v_readlane_b32 s30, v40, 0
; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7
; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6
; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
@@ -527,7 +528,6 @@ define half @call_split_type_used_outside_block_v8f16() #0 {
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: v_readlane_b32 s31, v40, 1
-; GFX7-NEXT: v_readlane_b32 s30, v40, 0
; GFX7-NEXT: s_mov_b32 s32, s33
; GFX7-NEXT: v_readlane_b32 s4, v40, 2
; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/swdev504645-global-fold.ll b/llvm/test/CodeGen/AMDGPU/swdev504645-global-fold.ll
index 5c6fcd4f977e3..13cde61ff16a0 100644
--- a/llvm/test/CodeGen/AMDGPU/swdev504645-global-fold.ll
+++ b/llvm/test/CodeGen/AMDGPU/swdev504645-global-fold.ll
@@ -18,11 +18,12 @@ define void @test_load_zext() #0 {
; CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; CHECK-NEXT: v_writelane_b32 v40, s30, 0
; CHECK-NEXT: s_mov_b32 s0, DescriptorBuffer@abs32@lo
+; CHECK-NEXT: s_nop 0
; CHECK-NEXT: v_writelane_b32 v40, s31, 1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[2:3]
-; CHECK-NEXT: v_readlane_b32 s31, v40, 1
; CHECK-NEXT: v_readlane_b32 s30, v40, 0
+; CHECK-NEXT: v_readlane_b32 s31, v40, 1
; CHECK-NEXT: s_mov_b32 s32, s33
; CHECK-NEXT: v_readlane_b32 s0, v40, 2
; CHECK-NEXT: s_or_saveexec_b64 s[2:3], -1
diff --git a/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.error.ll b/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.error.ll
index d76b2e5f2358d..837b66dadbba5 100644
--- a/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.error.ll
+++ b/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.error.ll
@@ -26,8 +26,8 @@ define void @tail_call_i32_inreg_divergent(i32 %vgpr) #0 {
; CHECK-NEXT: s_addc_u32 s17, s17, void_func_i32_inreg@rel32@hi+12
; CHECK-NEXT: ; illegal copy v0 to s0
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT: v_readlane_b32 s31, v40, 1
; CHECK-NEXT: v_readlane_b32 s30, v40, 0
+; CHECK-NEXT: v_readlane_b32 s31, v40, 1
; CHECK-NEXT: s_mov_b32 s32, s33
; CHECK-NEXT: v_readlane_b32 s4, v40, 2
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -62,8 +62,8 @@ define void @indirect_tail_call_i32_inreg_divergent(i32 %vgpr) #0 {
; CHECK-NEXT: ; illegal copy v0 to s0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT: v_readlane_b32 s31, v40, 1
; CHECK-NEXT: v_readlane_b32 s30, v40, 0
+; CHECK-NEXT: v_readlane_b32 s31, v40, 1
; CHECK-NEXT: s_mov_b32 s32, s33
; CHECK-NEXT: v_readlane_b32 s4, v40, 2
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll b/llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll
index e78d62561238b..e5215fe1acdef 100644
--- a/llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll
+++ b/llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll
@@ -646,29 +646,30 @@ define i32 @s_in_multiuse_A(i32 inreg %x, i32 inreg %y, i32 inreg %z, i32 inreg
; GCN-NEXT: s_or_saveexec_b32 s16, -1
; GCN-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b32 exec_lo, s16
-; GCN-NEXT: v_writelane_b32 v40, s2, 4
; GCN-NEXT: s_add_i32 s32, s32, 16
; GCN-NEXT: s_getpc_b64 s[16:17]
; GCN-NEXT: s_add_u32 s16, s16, use32 at gotpcrel32@lo+4
; GCN-NEXT: s_addc_u32 s17, s17, use32 at gotpcrel32@hi+12
-; GCN-NEXT: s_xor_b32 s0, s0, s1
+; GCN-NEXT: v_writelane_b32 v40, s2, 4
; GCN-NEXT: s_load_b64 s[16:17], s[16:17], 0x0
-; GCN-NEXT: v_writelane_b32 v40, s30, 0
-; GCN-NEXT: v_writelane_b32 v40, s31, 1
-; GCN-NEXT: v_writelane_b32 v40, s34, 2
+; GCN-NEXT: s_xor_b32 s0, s0, s1
+; GCN-NEXT: v_writelane_b32 v40, s34, 0
; GCN-NEXT: s_mov_b32 s34, s1
-; GCN-NEXT: v_writelane_b32 v40, s35, 3
+; GCN-NEXT: v_writelane_b32 v40, s35, 1
; GCN-NEXT: s_and_b32 s35, s0, s3
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GCN-NEXT: v_mov_b32_e32 v0, s35
+; GCN-NEXT: v_writelane_b32 v40, s30, 2
+; GCN-NEXT: v_writelane_b32 v40, s31, 3
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GCN-NEXT: s_xor_b32 s0, s35, s34
-; GCN-NEXT: v_readlane_b32 s35, v40, 3
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_readlane_b32 s30, v40, 2
; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: v_readlane_b32 s34, v40, 2
-; GCN-NEXT: v_readlane_b32 s31, v40, 1
-; GCN-NEXT: v_readlane_b32 s30, v40, 0
+; GCN-NEXT: v_readlane_b32 s31, v40, 3
+; GCN-NEXT: v_readlane_b32 s35, v40, 1
+; GCN-NEXT: v_readlane_b32 s34, v40, 0
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: v_readlane_b32 s0, v40, 4
; GCN-NEXT: s_or_saveexec_b32 s1, -1
@@ -702,20 +703,21 @@ define i32 @s_in_multiuse_B(i32 inreg %x, i32 inreg %y, i32 inreg %z, i32 inreg
; GCN-NEXT: s_xor_b32 s0, s0, s1
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: v_writelane_b32 v40, s30, 0
-; GCN-NEXT: v_writelane_b32 v40, s31, 1
-; GCN-NEXT: v_writelane_b32 v40, s34, 2
+; GCN-NEXT: v_writelane_b32 v40, s34, 0
; GCN-NEXT: s_mov_b32 s34, s1
-; GCN-NEXT: v_writelane_b32 v40, s35, 3
+; GCN-NEXT: v_writelane_b32 v40, s35, 1
; GCN-NEXT: s_and_b32 s35, s0, s3
+; GCN-NEXT: v_writelane_b32 v40, s30, 2
+; GCN-NEXT: v_writelane_b32 v40, s31, 3
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GCN-NEXT: s_xor_b32 s0, s35, s34
-; GCN-NEXT: v_readlane_b32 s35, v40, 3
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_readlane_b32 s30, v40, 2
; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: v_readlane_b32 s34, v40, 2
-; GCN-NEXT: v_readlane_b32 s31, v40, 1
-; GCN-NEXT: v_readlane_b32 s30, v40, 0
+; GCN-NEXT: v_readlane_b32 s31, v40, 3
+; GCN-NEXT: v_readlane_b32 s35, v40, 1
+; GCN-NEXT: v_readlane_b32 s34, v40, 0
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: v_readlane_b32 s0, v40, 4
; GCN-NEXT: s_or_saveexec_b32 s1, -1
diff --git a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
index 25e8581fb6cdd..639dcdcbf1c2a 100644
--- a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
+++ b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
@@ -14,22 +14,22 @@ define hidden void @widget() {
; GCN-NEXT: v_writelane_b32 v41, s16, 16
; GCN-NEXT: s_addk_i32 s32, 0x400
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-NEXT: v_writelane_b32 v41, s30, 0
-; GCN-NEXT: v_writelane_b32 v41, s31, 1
-; GCN-NEXT: v_writelane_b32 v41, s34, 2
-; GCN-NEXT: v_writelane_b32 v41, s35, 3
-; GCN-NEXT: v_writelane_b32 v41, s36, 4
-; GCN-NEXT: v_writelane_b32 v41, s37, 5
-; GCN-NEXT: v_writelane_b32 v41, s38, 6
-; GCN-NEXT: v_writelane_b32 v41, s39, 7
-; GCN-NEXT: v_writelane_b32 v41, s48, 8
-; GCN-NEXT: v_writelane_b32 v41, s49, 9
-; GCN-NEXT: v_writelane_b32 v41, s50, 10
-; GCN-NEXT: v_writelane_b32 v41, s51, 11
-; GCN-NEXT: v_writelane_b32 v41, s52, 12
-; GCN-NEXT: v_writelane_b32 v41, s53, 13
-; GCN-NEXT: v_writelane_b32 v41, s54, 14
-; GCN-NEXT: v_writelane_b32 v41, s55, 15
+; GCN-NEXT: v_writelane_b32 v41, s34, 0
+; GCN-NEXT: v_writelane_b32 v41, s35, 1
+; GCN-NEXT: v_writelane_b32 v41, s36, 2
+; GCN-NEXT: v_writelane_b32 v41, s37, 3
+; GCN-NEXT: v_writelane_b32 v41, s38, 4
+; GCN-NEXT: v_writelane_b32 v41, s39, 5
+; GCN-NEXT: v_writelane_b32 v41, s48, 6
+; GCN-NEXT: v_writelane_b32 v41, s49, 7
+; GCN-NEXT: v_writelane_b32 v41, s50, 8
+; GCN-NEXT: v_writelane_b32 v41, s51, 9
+; GCN-NEXT: v_writelane_b32 v41, s52, 10
+; GCN-NEXT: v_writelane_b32 v41, s53, 11
+; GCN-NEXT: v_writelane_b32 v41, s54, 12
+; GCN-NEXT: v_writelane_b32 v41, s55, 13
+; GCN-NEXT: v_writelane_b32 v41, s30, 14
+; GCN-NEXT: v_writelane_b32 v41, s31, 15
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: flat_load_dword v0, v[0:1]
@@ -93,22 +93,22 @@ define hidden void @widget() {
; GCN-NEXT: s_addc_u32 s17, s17, wibble at rel32@hi+12
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GCN-NEXT: .LBB0_8: ; %UnifiedReturnBlock
-; GCN-NEXT: v_readlane_b32 s55, v41, 15
-; GCN-NEXT: v_readlane_b32 s54, v41, 14
-; GCN-NEXT: v_readlane_b32 s53, v41, 13
-; GCN-NEXT: v_readlane_b32 s52, v41, 12
-; GCN-NEXT: v_readlane_b32 s51, v41, 11
-; GCN-NEXT: v_readlane_b32 s50, v41, 10
-; GCN-NEXT: v_readlane_b32 s49, v41, 9
-; GCN-NEXT: v_readlane_b32 s48, v41, 8
-; GCN-NEXT: v_readlane_b32 s39, v41, 7
-; GCN-NEXT: v_readlane_b32 s38, v41, 6
-; GCN-NEXT: v_readlane_b32 s37, v41, 5
-; GCN-NEXT: v_readlane_b32 s36, v41, 4
-; GCN-NEXT: v_readlane_b32 s35, v41, 3
-; GCN-NEXT: v_readlane_b32 s34, v41, 2
-; GCN-NEXT: v_readlane_b32 s31, v41, 1
-; GCN-NEXT: v_readlane_b32 s30, v41, 0
+; GCN-NEXT: v_readlane_b32 s30, v41, 14
+; GCN-NEXT: v_readlane_b32 s31, v41, 15
+; GCN-NEXT: v_readlane_b32 s55, v41, 13
+; GCN-NEXT: v_readlane_b32 s54, v41, 12
+; GCN-NEXT: v_readlane_b32 s53, v41, 11
+; GCN-NEXT: v_readlane_b32 s52, v41, 10
+; GCN-NEXT: v_readlane_b32 s51, v41, 9
+; GCN-NEXT: v_readlane_b32 s50, v41, 8
+; GCN-NEXT: v_readlane_b32 s49, v41, 7
+; GCN-NEXT: v_readlane_b32 s48, v41, 6
+; GCN-NEXT: v_readlane_b32 s39, v41, 5
+; GCN-NEXT: v_readlane_b32 s38, v41, 4
+; GCN-NEXT: v_readlane_b32 s37, v41, 3
+; GCN-NEXT: v_readlane_b32 s36, v41, 2
+; GCN-NEXT: v_readlane_b32 s35, v41, 1
+; GCN-NEXT: v_readlane_b32 s34, v41, 0
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: v_readlane_b32 s4, v41, 16
@@ -266,32 +266,32 @@ define hidden void @blam() {
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-NEXT: v_writelane_b32 v45, s30, 0
-; GCN-NEXT: v_writelane_b32 v45, s31, 1
-; GCN-NEXT: v_writelane_b32 v45, s34, 2
-; GCN-NEXT: v_writelane_b32 v45, s35, 3
-; GCN-NEXT: v_writelane_b32 v45, s36, 4
-; GCN-NEXT: v_writelane_b32 v45, s37, 5
-; GCN-NEXT: v_writelane_b32 v45, s38, 6
-; GCN-NEXT: v_writelane_b32 v45, s39, 7
-; GCN-NEXT: v_writelane_b32 v45, s48, 8
-; GCN-NEXT: v_writelane_b32 v45, s49, 9
-; GCN-NEXT: v_writelane_b32 v45, s50, 10
-; GCN-NEXT: v_writelane_b32 v45, s51, 11
-; GCN-NEXT: v_writelane_b32 v45, s52, 12
-; GCN-NEXT: v_writelane_b32 v45, s53, 13
-; GCN-NEXT: v_writelane_b32 v45, s54, 14
-; GCN-NEXT: v_writelane_b32 v45, s55, 15
-; GCN-NEXT: v_writelane_b32 v45, s64, 16
-; GCN-NEXT: v_writelane_b32 v45, s65, 17
-; GCN-NEXT: v_writelane_b32 v45, s66, 18
-; GCN-NEXT: v_writelane_b32 v45, s67, 19
-; GCN-NEXT: v_writelane_b32 v45, s68, 20
-; GCN-NEXT: v_writelane_b32 v45, s69, 21
-; GCN-NEXT: v_writelane_b32 v45, s70, 22
-; GCN-NEXT: v_writelane_b32 v45, s71, 23
-; GCN-NEXT: v_writelane_b32 v45, s80, 24
-; GCN-NEXT: v_writelane_b32 v45, s81, 25
+; GCN-NEXT: v_writelane_b32 v45, s34, 0
+; GCN-NEXT: v_writelane_b32 v45, s35, 1
+; GCN-NEXT: v_writelane_b32 v45, s36, 2
+; GCN-NEXT: v_writelane_b32 v45, s37, 3
+; GCN-NEXT: v_writelane_b32 v45, s38, 4
+; GCN-NEXT: v_writelane_b32 v45, s39, 5
+; GCN-NEXT: v_writelane_b32 v45, s48, 6
+; GCN-NEXT: v_writelane_b32 v45, s49, 7
+; GCN-NEXT: v_writelane_b32 v45, s50, 8
+; GCN-NEXT: v_writelane_b32 v45, s51, 9
+; GCN-NEXT: v_writelane_b32 v45, s52, 10
+; GCN-NEXT: v_writelane_b32 v45, s53, 11
+; GCN-NEXT: v_writelane_b32 v45, s54, 12
+; GCN-NEXT: v_writelane_b32 v45, s55, 13
+; GCN-NEXT: v_writelane_b32 v45, s64, 14
+; GCN-NEXT: v_writelane_b32 v45, s65, 15
+; GCN-NEXT: v_writelane_b32 v45, s66, 16
+; GCN-NEXT: v_writelane_b32 v45, s67, 17
+; GCN-NEXT: v_writelane_b32 v45, s68, 18
+; GCN-NEXT: v_writelane_b32 v45, s69, 19
+; GCN-NEXT: v_writelane_b32 v45, s70, 20
+; GCN-NEXT: v_writelane_b32 v45, s71, 21
+; GCN-NEXT: v_writelane_b32 v45, s80, 22
+; GCN-NEXT: v_writelane_b32 v45, s81, 23
+; GCN-NEXT: v_writelane_b32 v45, s30, 24
+; GCN-NEXT: v_writelane_b32 v45, s31, 25
; GCN-NEXT: v_mov_b32_e32 v40, v31
; GCN-NEXT: s_mov_b32 s54, s15
; GCN-NEXT: s_mov_b32 s55, s14
@@ -427,32 +427,32 @@ define hidden void @blam() {
; GCN-NEXT: s_branch .LBB1_1
; GCN-NEXT: .LBB1_18: ; %DummyReturnBlock
; GCN-NEXT: s_or_b64 exec, exec, s[66:67]
-; GCN-NEXT: v_readlane_b32 s81, v45, 25
-; GCN-NEXT: v_readlane_b32 s80, v45, 24
-; GCN-NEXT: v_readlane_b32 s71, v45, 23
-; GCN-NEXT: v_readlane_b32 s70, v45, 22
-; GCN-NEXT: v_readlane_b32 s69, v45, 21
-; GCN-NEXT: v_readlane_b32 s68, v45, 20
-; GCN-NEXT: v_readlane_b32 s67, v45, 19
-; GCN-NEXT: v_readlane_b32 s66, v45, 18
-; GCN-NEXT: v_readlane_b32 s65, v45, 17
-; GCN-NEXT: v_readlane_b32 s64, v45, 16
-; GCN-NEXT: v_readlane_b32 s55, v45, 15
-; GCN-NEXT: v_readlane_b32 s54, v45, 14
-; GCN-NEXT: v_readlane_b32 s53, v45, 13
-; GCN-NEXT: v_readlane_b32 s52, v45, 12
-; GCN-NEXT: v_readlane_b32 s51, v45, 11
-; GCN-NEXT: v_readlane_b32 s50, v45, 10
-; GCN-NEXT: v_readlane_b32 s49, v45, 9
-; GCN-NEXT: v_readlane_b32 s48, v45, 8
-; GCN-NEXT: v_readlane_b32 s39, v45, 7
-; GCN-NEXT: v_readlane_b32 s38, v45, 6
-; GCN-NEXT: v_readlane_b32 s37, v45, 5
-; GCN-NEXT: v_readlane_b32 s36, v45, 4
-; GCN-NEXT: v_readlane_b32 s35, v45, 3
-; GCN-NEXT: v_readlane_b32 s34, v45, 2
-; GCN-NEXT: v_readlane_b32 s31, v45, 1
-; GCN-NEXT: v_readlane_b32 s30, v45, 0
+; GCN-NEXT: v_readlane_b32 s30, v45, 24
+; GCN-NEXT: v_readlane_b32 s31, v45, 25
+; GCN-NEXT: v_readlane_b32 s81, v45, 23
+; GCN-NEXT: v_readlane_b32 s80, v45, 22
+; GCN-NEXT: v_readlane_b32 s71, v45, 21
+; GCN-NEXT: v_readlane_b32 s70, v45, 20
+; GCN-NEXT: v_readlane_b32 s69, v45, 19
+; GCN-NEXT: v_readlane_b32 s68, v45, 18
+; GCN-NEXT: v_readlane_b32 s67, v45, 17
+; GCN-NEXT: v_readlane_b32 s66, v45, 16
+; GCN-NEXT: v_readlane_b32 s65, v45, 15
+; GCN-NEXT: v_readlane_b32 s64, v45, 14
+; GCN-NEXT: v_readlane_b32 s55, v45, 13
+; GCN-NEXT: v_readlane_b32 s54, v45, 12
+; GCN-NEXT: v_readlane_b32 s53, v45, 11
+; GCN-NEXT: v_readlane_b32 s52, v45, 10
+; GCN-NEXT: v_readlane_b32 s51, v45, 9
+; GCN-NEXT: v_readlane_b32 s50, v45, 8
+; GCN-NEXT: v_readlane_b32 s49, v45, 7
+; GCN-NEXT: v_readlane_b32 s48, v45, 6
+; GCN-NEXT: v_readlane_b32 s39, v45, 5
+; GCN-NEXT: v_readlane_b32 s38, v45, 4
+; GCN-NEXT: v_readlane_b32 s37, v45, 3
+; GCN-NEXT: v_readlane_b32 s36, v45, 2
+; GCN-NEXT: v_readlane_b32 s35, v45, 1
+; GCN-NEXT: v_readlane_b32 s34, v45, 0
; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
index 9ce8859c410fc..c902ac6173935 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
@@ -52,8 +52,8 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
-; GFX9-NEXT: v_readlane_b32 s31, v44, 1
; GFX9-NEXT: v_readlane_b32 s30, v44, 0
+; GFX9-NEXT: v_readlane_b32 s31, v44, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s4, v44, 2
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -109,8 +109,8 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4
; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12
-; GFX10-NEXT: v_readlane_b32 s31, v44, 1
; GFX10-NEXT: v_readlane_b32 s30, v44, 0
+; GFX10-NEXT: v_readlane_b32 s31, v44, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s4, v44, 2
; GFX10-NEXT: s_or_saveexec_b32 s5, -1
@@ -163,8 +163,8 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:4
; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:8
; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:12
-; GFX11-NEXT: v_readlane_b32 s31, v44, 1
; GFX11-NEXT: v_readlane_b32 s30, v44, 0
+; GFX11-NEXT: v_readlane_b32 s31, v44, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v44, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -236,8 +236,8 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
-; GFX9-NEXT: v_readlane_b32 s31, v45, 1
; GFX9-NEXT: v_readlane_b32 s30, v45, 0
+; GFX9-NEXT: v_readlane_b32 s31, v45, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s4, v45, 2
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -286,8 +286,8 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8
; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16
-; GFX10-NEXT: v_readlane_b32 s31, v45, 1
; GFX10-NEXT: v_readlane_b32 s30, v45, 0
+; GFX10-NEXT: v_readlane_b32 s31, v45, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s4, v45, 2
; GFX10-NEXT: s_or_saveexec_b32 s5, -1
@@ -335,8 +335,8 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:8
; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:12
; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:16
-; GFX11-NEXT: v_readlane_b32 s31, v45, 1
; GFX11-NEXT: v_readlane_b32 s30, v45, 0
+; GFX11-NEXT: v_readlane_b32 s31, v45, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v45, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll
index 1805add14e47a..a76f4495f85d5 100644
--- a/llvm/test/CodeGen/AMDGPU/wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -3086,8 +3086,8 @@ define void @callee_no_stack_with_call() #1 {
; GFX1032-NEXT: v_writelane_b32 v40, s31, 1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT: v_readlane_b32 s31, v40, 1
; GFX1032-NEXT: v_readlane_b32 s30, v40, 0
+; GFX1032-NEXT: v_readlane_b32 s31, v40, 1
; GFX1032-NEXT: s_mov_b32 s32, s33
; GFX1032-NEXT: v_readlane_b32 s4, v40, 2
; GFX1032-NEXT: s_or_saveexec_b32 s5, -1
@@ -3117,8 +3117,8 @@ define void @callee_no_stack_with_call() #1 {
; GFX1064-NEXT: v_writelane_b32 v40, s31, 1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT: v_readlane_b32 s31, v40, 1
; GFX1064-NEXT: v_readlane_b32 s30, v40, 0
+; GFX1064-NEXT: v_readlane_b32 s31, v40, 1
; GFX1064-NEXT: s_mov_b32 s32, s33
; GFX1064-NEXT: v_readlane_b32 s4, v40, 2
; GFX1064-NEXT: s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll
index 50195bc5bf414..087eecbfda19e 100644
--- a/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll
+++ b/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll
@@ -1593,8 +1593,8 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
; DAGISEL-NEXT: s_wait_alu 0xfffe
; DAGISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; DAGISEL-NEXT: v_readlane_b32 s31, v40, 2
; DAGISEL-NEXT: v_readlane_b32 s30, v40, 1
+; DAGISEL-NEXT: v_readlane_b32 s31, v40, 2
; DAGISEL-NEXT: v_readlane_b32 s4, v40, 0
; DAGISEL-NEXT: v_readlane_b32 s0, v40, 3
; DAGISEL-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
@@ -1929,8 +1929,8 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
; GISEL-NEXT: s_wait_alu 0xfffe
; GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-NEXT: v_readlane_b32 s31, v40, 2
; GISEL-NEXT: v_readlane_b32 s30, v40, 1
+; GISEL-NEXT: v_readlane_b32 s31, v40, 2
; GISEL-NEXT: v_readlane_b32 s4, v40, 0
; GISEL-NEXT: v_readlane_b32 s0, v40, 3
; GISEL-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
@@ -2266,8 +2266,8 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
; DAGISEL64-NEXT: s_wait_alu 0xfffe
; DAGISEL64-NEXT: s_swappc_b64 s[30:31], s[0:1]
; DAGISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; DAGISEL64-NEXT: v_readlane_b32 s31, v40, 3
; DAGISEL64-NEXT: v_readlane_b32 s30, v40, 2
+; DAGISEL64-NEXT: v_readlane_b32 s31, v40, 3
; DAGISEL64-NEXT: v_readlane_b32 s5, v40, 1
; DAGISEL64-NEXT: v_readlane_b32 s4, v40, 0
; DAGISEL64-NEXT: v_readlane_b32 s0, v40, 4
@@ -2604,8 +2604,8 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
; GISEL64-NEXT: s_wait_alu 0xfffe
; GISEL64-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL64-NEXT: v_readlane_b32 s31, v40, 3
; GISEL64-NEXT: v_readlane_b32 s30, v40, 2
+; GISEL64-NEXT: v_readlane_b32 s31, v40, 3
; GISEL64-NEXT: v_readlane_b32 s5, v40, 1
; GISEL64-NEXT: v_readlane_b32 s4, v40, 0
; GISEL64-NEXT: v_readlane_b32 s0, v40, 4
@@ -3719,8 +3719,8 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
; GFX1250-DAGISEL-NEXT: v_writelane_b32 v40, s31, 2
; GFX1250-DAGISEL-NEXT: s_swap_pc_i64 s[30:31], s[0:1]
; GFX1250-DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-DAGISEL-NEXT: v_readlane_b32 s31, v40, 2
; GFX1250-DAGISEL-NEXT: v_readlane_b32 s30, v40, 1
+; GFX1250-DAGISEL-NEXT: v_readlane_b32 s31, v40, 2
; GFX1250-DAGISEL-NEXT: v_readlane_b32 s4, v40, 0
; GFX1250-DAGISEL-NEXT: v_readlane_b32 s0, v40, 3
; GFX1250-DAGISEL-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
@@ -8049,8 +8049,8 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
; DAGISEL-NEXT: s_wait_alu 0xfffe
; DAGISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
; DAGISEL-NEXT: flat_store_b32 v[40:41], v0
-; DAGISEL-NEXT: v_readlane_b32 s31, v42, 2
; DAGISEL-NEXT: v_readlane_b32 s30, v42, 1
+; DAGISEL-NEXT: v_readlane_b32 s31, v42, 2
; DAGISEL-NEXT: v_readlane_b32 s4, v42, 0
; DAGISEL-NEXT: v_readlane_b32 s0, v42, 3
; DAGISEL-NEXT: s_clause 0x2 ; 12-byte Folded Reload
@@ -8390,8 +8390,8 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
; GISEL-NEXT: s_wait_alu 0xfffe
; GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GISEL-NEXT: flat_store_b32 v[40:41], v0
-; GISEL-NEXT: v_readlane_b32 s31, v42, 2
; GISEL-NEXT: v_readlane_b32 s30, v42, 1
+; GISEL-NEXT: v_readlane_b32 s31, v42, 2
; GISEL-NEXT: v_readlane_b32 s4, v42, 0
; GISEL-NEXT: v_readlane_b32 s0, v42, 3
; GISEL-NEXT: s_clause 0x2 ; 12-byte Folded Reload
@@ -8733,8 +8733,8 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
; DAGISEL64-NEXT: s_wait_alu 0xfffe
; DAGISEL64-NEXT: s_swappc_b64 s[30:31], s[0:1]
; DAGISEL64-NEXT: flat_store_b32 v[40:41], v0
-; DAGISEL64-NEXT: v_readlane_b32 s31, v42, 3
; DAGISEL64-NEXT: v_readlane_b32 s30, v42, 2
+; DAGISEL64-NEXT: v_readlane_b32 s31, v42, 3
; DAGISEL64-NEXT: v_readlane_b32 s5, v42, 1
; DAGISEL64-NEXT: v_readlane_b32 s4, v42, 0
; DAGISEL64-NEXT: v_readlane_b32 s0, v42, 4
@@ -9077,8 +9077,8 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
; GISEL64-NEXT: s_wait_alu 0xfffe
; GISEL64-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GISEL64-NEXT: flat_store_b32 v[40:41], v0
-; GISEL64-NEXT: v_readlane_b32 s31, v42, 3
; GISEL64-NEXT: v_readlane_b32 s30, v42, 2
+; GISEL64-NEXT: v_readlane_b32 s31, v42, 3
; GISEL64-NEXT: v_readlane_b32 s5, v42, 1
; GISEL64-NEXT: v_readlane_b32 s4, v42, 0
; GISEL64-NEXT: v_readlane_b32 s0, v42, 4
@@ -10198,8 +10198,8 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
; GFX1250-DAGISEL-NEXT: v_writelane_b32 v42, s31, 2
; GFX1250-DAGISEL-NEXT: s_swap_pc_i64 s[30:31], s[0:1]
; GFX1250-DAGISEL-NEXT: flat_store_b32 v[40:41], v0
-; GFX1250-DAGISEL-NEXT: v_readlane_b32 s31, v42, 2
; GFX1250-DAGISEL-NEXT: v_readlane_b32 s30, v42, 1
+; GFX1250-DAGISEL-NEXT: v_readlane_b32 s31, v42, 2
; GFX1250-DAGISEL-NEXT: v_readlane_b32 s4, v42, 0
; GFX1250-DAGISEL-NEXT: v_readlane_b32 s0, v42, 3
; GFX1250-DAGISEL-NEXT: s_clause 0x2 ; 12-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll
index 06c451869e841..3fe54cd045c0f 100644
--- a/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll
@@ -41,12 +41,12 @@ define void @vector_reg_liverange_split() #0 {
; GFX90A-NEXT: s_or_saveexec_b64 s[28:29], -1
; GFX90A-NEXT: v_accvgpr_read_b32 v39, a32
; GFX90A-NEXT: s_mov_b64 exec, s[28:29]
+; GFX90A-NEXT: v_readlane_b32 s30, v40, 0
; GFX90A-NEXT: v_readlane_b32 s20, v39, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s20
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_readlane_b32 s31, v40, 1
-; GFX90A-NEXT: v_readlane_b32 s30, v40, 0
; GFX90A-NEXT: s_mov_b32 s32, s33
; GFX90A-NEXT: v_readlane_b32 s4, v40, 4
; GFX90A-NEXT: v_readlane_b32 s28, v40, 2
diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll
index 9e9fe1809c780..b5c0023bd5c2a 100644
--- a/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll
@@ -49,10 +49,10 @@ define void @test() #0 {
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_readlane_b32 s4, v39, 0
; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_readlane_b32 s30, v40, 0
; GCN-NEXT: global_store_dword v[0:1], v0, off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_readlane_b32 s31, v40, 1
-; GCN-NEXT: v_readlane_b32 s30, v40, 0
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: v_readlane_b32 s4, v40, 4
; GCN-NEXT: v_readlane_b32 s28, v40, 2
@@ -111,8 +111,8 @@ define void @test() #0 {
; GCN-O0-NEXT: v_mov_b32_e32 v2, s4
; GCN-O0-NEXT: global_store_dword v[0:1], v2, off
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
-; GCN-O0-NEXT: v_readlane_b32 s31, v40, 1
; GCN-O0-NEXT: v_readlane_b32 s30, v40, 0
+; GCN-O0-NEXT: v_readlane_b32 s31, v40, 1
; GCN-O0-NEXT: s_mov_b32 s32, s33
; GCN-O0-NEXT: v_readlane_b32 s4, v40, 4
; GCN-O0-NEXT: v_readlane_b32 s28, v40, 2
diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
index e9a0671ead4e0..fe641367944be 100644
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
@@ -387,8 +387,8 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg
; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41]
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1
; GFX9-O0-NEXT: buffer_store_dword v0, off, s[36:39], s34 offset:4
-; GFX9-O0-NEXT: v_readlane_b32 s31, v3, 1
; GFX9-O0-NEXT: v_readlane_b32 s30, v3, 0
+; GFX9-O0-NEXT: v_readlane_b32 s31, v3, 1
; GFX9-O0-NEXT: s_mov_b32 s32, s33
; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1
; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -424,9 +424,9 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg
; GFX9-O3-NEXT: v_add_u32_e32 v1, v1, v2
; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O3-NEXT: v_mov_b32_e32 v0, v1
+; GFX9-O3-NEXT: v_readlane_b32 s30, v3, 0
; GFX9-O3-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4
; GFX9-O3-NEXT: v_readlane_b32 s31, v3, 1
-; GFX9-O3-NEXT: v_readlane_b32 s30, v3, 0
; GFX9-O3-NEXT: s_mov_b32 s32, s33
; GFX9-O3-NEXT: s_xor_saveexec_b64 s[34:35], -1
; GFX9-O3-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -622,8 +622,8 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3
; GFX9-O0-NEXT: s_mov_b32 s34, 0
; GFX9-O0-NEXT: buffer_store_dwordx2 v[0:1], off, s[36:39], s34 offset:4
-; GFX9-O0-NEXT: v_readlane_b32 s31, v10, 1
; GFX9-O0-NEXT: v_readlane_b32 s30, v10, 0
+; GFX9-O0-NEXT: v_readlane_b32 s31, v10, 1
; GFX9-O0-NEXT: s_mov_b32 s32, s33
; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1
; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -683,9 +683,9 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
; GFX9-O3-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-O3-NEXT: v_mov_b32_e32 v0, v2
; GFX9-O3-NEXT: v_mov_b32_e32 v1, v3
+; GFX9-O3-NEXT: v_readlane_b32 s30, v8, 0
; GFX9-O3-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 offset:4
; GFX9-O3-NEXT: v_readlane_b32 s31, v8, 1
-; GFX9-O3-NEXT: v_readlane_b32 s30, v8, 0
; GFX9-O3-NEXT: s_mov_b32 s32, s33
; GFX9-O3-NEXT: s_xor_saveexec_b64 s[34:35], -1
; GFX9-O3-NEXT: buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload
More information about the llvm-branch-commits
mailing list