[llvm-branch-commits] [llvm] [amdgpu-cfi: 6/9]: [AMDGPU] Use register pair for PC spill (PR #183146)

Scott Linder via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Mon Apr 6 13:48:24 PDT 2026


https://github.com/slinder1 updated https://github.com/llvm/llvm-project/pull/183146

>From 513df2b4d8ca0e71ede4e66831284932721fd32a Mon Sep 17 00:00:00 2001
From: Scott Linder <Scott.Linder at amd.com>
Date: Wed, 29 Oct 2025 18:46:12 +0000
Subject: [PATCH] [AMDGPU] Use register pair for PC spill

Change-Id: Ibedeef926f7ff235a06de65a83087c151f66a416
---
 llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp  |   21 +
 .../CodeGen/AMDGPU/GlobalISel/assert-align.ll |    2 +-
 .../GlobalISel/call-outgoing-stack-args.ll    |    8 +-
 .../CodeGen/AMDGPU/GlobalISel/localizer.ll    |    2 +-
 .../abi-attribute-hints-undefined-behavior.ll |    2 +-
 .../CodeGen/AMDGPU/amdgcn-call-whole-wave.ll  |    8 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll  | 8662 ++++++++---------
 .../CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll   |  226 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.160bit.ll   |   51 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll    |   26 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.192bit.ll   |  123 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll   |   51 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll   |  226 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll   |   51 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll   |  266 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll    |  170 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll   |   51 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll   |  123 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll   |  122 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll    |   26 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll   | 3122 +++---
 .../CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll   |  186 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll   |  858 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll    |  226 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll   | 1650 ++--
 .../CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll   | 2058 ++--
 .../CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll   | 2474 ++---
 .../CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll   | 2922 +++---
 .../CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll   | 3482 +++----
 .../CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll    |  122 +-
 .../test/CodeGen/AMDGPU/amdgcn.bitcast.ptr.ll |    4 +-
 .../AMDGPU/amdgpu-cs-chain-preserve-cc.ll     |   54 +-
 .../amdgpu-simplify-libcall-pow-codegen.ll    |  285 +-
 ...tor-flatscratchinit-undefined-behavior2.ll |   13 +-
 llvm/test/CodeGen/AMDGPU/bf16.ll              |  998 +-
 .../test/CodeGen/AMDGPU/branch-relax-spill.ll |  156 +-
 .../CodeGen/AMDGPU/call-args-inreg-bfloat.ll  |    8 +-
 .../call-args-inreg-no-sgpr-for-csrspill.ll   |    6 +-
 llvm/test/CodeGen/AMDGPU/call-args-inreg.ll   |   88 +-
 .../CodeGen/AMDGPU/call-argument-types.ll     |   40 +-
 .../AMDGPU/call-graph-register-usage.ll       |    2 +-
 .../AMDGPU/call-preserved-registers.ll        |  116 +-
 llvm/test/CodeGen/AMDGPU/call-skip.ll         |    2 +-
 .../test/CodeGen/AMDGPU/callee-frame-setup.ll |  106 +-
 .../callee-special-input-vgprs-packed.ll      |   14 +-
 .../AMDGPU/callee-special-input-vgprs.ll      |   14 +-
 llvm/test/CodeGen/AMDGPU/cc-entry.ll          |    2 +-
 .../AMDGPU/cc-inreg-sgpr0-3-mismatch.ll       |    4 +-
 .../AMDGPU/copysign-simplify-demanded-bits.ll |   56 +-
 .../AMDGPU/cross-block-use-is-not-abi-copy.ll |    8 +-
 llvm/test/CodeGen/AMDGPU/debug-frame.ll       |    8 +-
 .../AMDGPU/dwarf-multi-register-use-crash.ll  |   64 +-
 .../dynamic-vgpr-reserve-stack-for-cwsr.ll    |    4 +-
 .../AMDGPU/eliminate-frame-index-select.ll    |   60 +-
 .../fix-frame-reg-in-custom-csr-spills.ll     |    2 +-
 ...frame-setup-without-sgpr-to-vgpr-spills.ll |   25 +-
 .../CodeGen/AMDGPU/function-args-inreg.ll     |    8 +-
 .../CodeGen/AMDGPU/gfx-call-non-gfx-func.ll   |  144 +-
 .../AMDGPU/gfx-callable-argument-types.ll     | 1226 +--
 .../gfx-callable-preserved-registers.ll       |   72 +-
 .../AMDGPU/gfx-callable-return-types.ll       |   42 +-
 llvm/test/CodeGen/AMDGPU/global-alias.ll      |    2 +-
 .../identical-subrange-spill-infloop.ll       |   96 +-
 llvm/test/CodeGen/AMDGPU/indirect-call.ll     | 1104 +--
 .../CodeGen/AMDGPU/insert-delay-alu-bug.ll    |    2 +-
 .../CodeGen/AMDGPU/insert-waitcnts-crash.ll   |   12 +-
 llvm/test/CodeGen/AMDGPU/issue176578.ll       |   60 +-
 ....amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll |    6 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.reduce.fadd.ll |    7 +
 .../CodeGen/AMDGPU/llvm.amdgcn.reduce.fmax.ll |    7 +
 .../CodeGen/AMDGPU/llvm.amdgcn.reduce.fmin.ll |    7 +
 .../CodeGen/AMDGPU/llvm.amdgcn.reduce.fsub.ll |    7 +
 .../AMDGPU/materialize-frame-index-sgpr.ll    | 1634 ++--
 .../CodeGen/AMDGPU/mul24-pass-ordering.ll     |   20 +-
 .../AMDGPU/need-fp-from-vgpr-spills.ll        |    6 +-
 llvm/test/CodeGen/AMDGPU/nested-calls.ll      |    4 +-
 .../AMDGPU/no-source-locations-in-prologue.ll |    2 +-
 llvm/test/CodeGen/AMDGPU/nofpclass-call.ll    |   12 +-
 .../AMDGPU/preserve-wwm-copy-dst-reg.ll       |   25 +-
 .../promote-constOffset-to-imm-gfx12.mir      |    6 +
 llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll |    6 +-
 .../AMDGPU/sgpr-spill-overlap-wwm-reserve.mir |  190 +-
 .../AMDGPU/sgpr-spills-split-regalloc.ll      |   27 +-
 .../AMDGPU/shufflevector.v2i64.v8i64.ll       |  299 +-
 .../si-lower-sgpr-spills-vgpr-lanes-usage.mir |   18 +-
 llvm/test/CodeGen/AMDGPU/sibling-call.ll      |  222 +-
 llvm/test/CodeGen/AMDGPU/stack-realign.ll     |    2 +-
 .../CodeGen/AMDGPU/stacksave_stackrestore.ll  |   10 +-
 .../AMDGPU/strictfp_f16_abi_promote.ll        |   14 +-
 .../CodeGen/AMDGPU/swdev504645-global-fold.ll |    3 +-
 .../AMDGPU/tail-call-inreg-arguments.error.ll |    4 +-
 ...unfold-masked-merge-scalar-variablemask.ll |   38 +-
 .../AMDGPU/unstructured-cfg-def-use-issue.ll  |  168 +-
 .../AMDGPU/vgpr-mark-last-scratch-load.ll     |   56 +-
 .../CodeGen/AMDGPU/vgpr-tuple-allocation.ll   |   12 +-
 ...terfall-call-target-av-register-failure.ll |    2 +-
 llvm/test/CodeGen/AMDGPU/wave32.ll            |    4 +-
 .../CodeGen/AMDGPU/whole-wave-functions.ll    |   25 +-
 .../AMDGPU/whole-wave-register-copy.ll        |    2 +-
 .../AMDGPU/whole-wave-register-spill.ll       |    4 +-
 .../test/CodeGen/AMDGPU/wwm-reserved-spill.ll |    8 +-
 101 files changed, 17190 insertions(+), 17891 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
index 06b221e171fd4..9a3ffc2d35462 100644
--- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
@@ -268,11 +268,20 @@ bool SILowerSGPRSpills::spillCalleeSavedRegs(
 
     std::vector<CalleeSavedInfo> CSI;
     const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
+    MCRegister RetAddrReg = TRI->getReturnAddressReg(MF);
+    MCRegister RetAddrRegSub0 = TRI->getSubReg(RetAddrReg, AMDGPU::sub0);
+    MCRegister RetAddrRegSub1 = TRI->getSubReg(RetAddrReg, AMDGPU::sub1);
+    bool SpillRetAddrReg = false;
 
     for (unsigned I = 0; CSRegs[I]; ++I) {
       MCRegister Reg = CSRegs[I];
 
       if (SavedRegs.test(Reg)) {
+        if (Reg == RetAddrRegSub0 || Reg == RetAddrRegSub1) {
+          SpillRetAddrReg = true;
+          continue;
+        }
+
         const TargetRegisterClass *RC =
           TRI->getMinimalPhysRegClass(Reg, MVT::i32);
         int JunkFI = MFI.CreateStackObject(TRI->getSpillSize(*RC),
@@ -283,6 +292,18 @@ bool SILowerSGPRSpills::spillCalleeSavedRegs(
       }
     }
 
+    // Return address uses a register pair. Add the super register to the
+    // CSI list so that it's easier to identify the entire spill and CFI
+    // can be emitted appropriately.
+    if (SpillRetAddrReg) {
+      const TargetRegisterClass *RC =
+          TRI->getMinimalPhysRegClass(RetAddrReg, MVT::i64);
+      int JunkFI = MFI.CreateStackObject(TRI->getSpillSize(*RC),
+                                         TRI->getSpillAlign(*RC), true);
+      CSI.push_back(CalleeSavedInfo(RetAddrReg, JunkFI));
+      CalleeSavedFIs.push_back(JunkFI);
+    }
+
     if (!CSI.empty()) {
       for (MachineBasicBlock *SaveBlock : SaveBlocks)
         insertCSRSaves(*SaveBlock, CSI, Indexes, LIS);
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll
index 62fe5f101b458..bd808190f6eb2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll
@@ -23,10 +23,10 @@ define ptr addrspace(1) @call_assert_align() {
 ; CHECK-NEXT:    v_writelane_b32 v40, s31, 1
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 0
+; CHECK-NEXT:    v_readlane_b32 s30, v40, 0
 ; CHECK-NEXT:    global_store_dword v[0:1], v2, off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
-; CHECK-NEXT:    v_readlane_b32 s30, v40, 0
 ; CHECK-NEXT:    s_mov_b32 s32, s33
 ; CHECK-NEXT:    v_readlane_b32 s4, v40, 2
 ; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
index 7e6f500181ec6..2c1beb8468576 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
@@ -238,8 +238,8 @@ define void @func_caller_stack() #2 {
 ; MUBUF-NEXT:    v_writelane_b32 v40, s31, 1
 ; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:16
 ; MUBUF-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; MUBUF-NEXT:    v_readlane_b32 s31, v40, 1
 ; MUBUF-NEXT:    v_readlane_b32 s30, v40, 0
+; MUBUF-NEXT:    v_readlane_b32 s31, v40, 1
 ; MUBUF-NEXT:    s_mov_b32 s32, s33
 ; MUBUF-NEXT:    v_readlane_b32 s4, v40, 2
 ; MUBUF-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -277,8 +277,8 @@ define void @func_caller_stack() #2 {
 ; FLATSCR-NEXT:    s_addc_u32 s1, s1, external_void_func_v16i32_v16i32_v4i32@rel32@hi+12
 ; FLATSCR-NEXT:    v_writelane_b32 v40, s31, 1
 ; FLATSCR-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; FLATSCR-NEXT:    v_readlane_b32 s31, v40, 1
 ; FLATSCR-NEXT:    v_readlane_b32 s30, v40, 0
+; FLATSCR-NEXT:    v_readlane_b32 s31, v40, 1
 ; FLATSCR-NEXT:    s_mov_b32 s32, s33
 ; FLATSCR-NEXT:    v_readlane_b32 s0, v40, 2
 ; FLATSCR-NEXT:    s_or_saveexec_b64 s[2:3], -1
@@ -363,8 +363,8 @@ define void @func_caller_byval(ptr addrspace(5) %argptr) #2 {
 ; MUBUF-NEXT:    s_waitcnt vmcnt(1)
 ; MUBUF-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:60
 ; MUBUF-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; MUBUF-NEXT:    v_readlane_b32 s31, v40, 1
 ; MUBUF-NEXT:    v_readlane_b32 s30, v40, 0
+; MUBUF-NEXT:    v_readlane_b32 s31, v40, 1
 ; MUBUF-NEXT:    s_mov_b32 s32, s33
 ; MUBUF-NEXT:    v_readlane_b32 s4, v40, 2
 ; MUBUF-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -414,8 +414,8 @@ define void @func_caller_byval(ptr addrspace(5) %argptr) #2 {
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s32 offset:56
 ; FLATSCR-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; FLATSCR-NEXT:    v_readlane_b32 s31, v40, 1
 ; FLATSCR-NEXT:    v_readlane_b32 s30, v40, 0
+; FLATSCR-NEXT:    v_readlane_b32 s31, v40, 1
 ; FLATSCR-NEXT:    s_mov_b32 s32, s33
 ; FLATSCR-NEXT:    v_readlane_b32 s0, v40, 2
 ; FLATSCR-NEXT:    s_or_saveexec_b64 s[2:3], -1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
index 0ecb3e8e94f0c..8225ea1f8fda7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
@@ -245,8 +245,8 @@ define void @sink_null_insert_pt(ptr addrspace(4) %arg0) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], 0
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
index 3194581fa4213..0e24430e7be20 100644
--- a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
+++ b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
@@ -30,8 +30,8 @@ define void @parent_func_missing_inputs() #0 {
 ; FIXEDABI-NEXT:    s_addc_u32 s17, s17, requires_all_inputs@rel32@hi+12
 ; FIXEDABI-NEXT:    v_writelane_b32 v40, s31, 1
 ; FIXEDABI-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; FIXEDABI-NEXT:    v_readlane_b32 s31, v40, 1
 ; FIXEDABI-NEXT:    v_readlane_b32 s30, v40, 0
+; FIXEDABI-NEXT:    v_readlane_b32 s31, v40, 1
 ; FIXEDABI-NEXT:    s_mov_b32 s32, s33
 ; FIXEDABI-NEXT:    v_readlane_b32 s4, v40, 2
 ; FIXEDABI-NEXT:    s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn-call-whole-wave.ll b/llvm/test/CodeGen/AMDGPU/amdgcn-call-whole-wave.ll
index c78544bee46a2..60ce2ce2d99ae 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn-call-whole-wave.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn-call-whole-wave.ll
@@ -35,8 +35,8 @@ define amdgpu_gfx void @basic_test(i32 %x, i32 inreg %c, ptr addrspace(1) %ptr)
 ; DAGISEL-NEXT:    s_clause 0x1 ; 8-byte Folded Reload
 ; DAGISEL-NEXT:    scratch_load_b32 v41, off, s33
 ; DAGISEL-NEXT:    scratch_load_b32 v40, off, s33 offset:4
-; DAGISEL-NEXT:    v_readlane_b32 s31, v42, 1
 ; DAGISEL-NEXT:    v_readlane_b32 s30, v42, 0
+; DAGISEL-NEXT:    v_readlane_b32 s31, v42, 1
 ; DAGISEL-NEXT:    s_mov_b32 s32, s33
 ; DAGISEL-NEXT:    v_readlane_b32 s0, v42, 2
 ; DAGISEL-NEXT:    s_or_saveexec_b32 s1, -1
@@ -78,8 +78,8 @@ define amdgpu_gfx void @basic_test(i32 %x, i32 inreg %c, ptr addrspace(1) %ptr)
 ; GISEL-NEXT:    s_clause 0x1 ; 8-byte Folded Reload
 ; GISEL-NEXT:    scratch_load_b32 v41, off, s33
 ; GISEL-NEXT:    scratch_load_b32 v40, off, s33 offset:4
-; GISEL-NEXT:    v_readlane_b32 s31, v42, 1
 ; GISEL-NEXT:    v_readlane_b32 s30, v42, 0
+; GISEL-NEXT:    v_readlane_b32 s31, v42, 1
 ; GISEL-NEXT:    s_mov_b32 s32, s33
 ; GISEL-NEXT:    v_readlane_b32 s0, v42, 2
 ; GISEL-NEXT:    s_or_saveexec_b32 s1, -1
@@ -787,8 +787,8 @@ define amdgpu_gfx void @ret_void(i32 %x) #0 {
 ; DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; DAGISEL-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; DAGISEL-NEXT:    v_readlane_b32 s31, v40, 1
 ; DAGISEL-NEXT:    v_readlane_b32 s30, v40, 0
+; DAGISEL-NEXT:    v_readlane_b32 s31, v40, 1
 ; DAGISEL-NEXT:    s_mov_b32 s32, s33
 ; DAGISEL-NEXT:    v_readlane_b32 s0, v40, 2
 ; DAGISEL-NEXT:    s_or_saveexec_b32 s1, -1
@@ -822,8 +822,8 @@ define amdgpu_gfx void @ret_void(i32 %x) #0 {
 ; GISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GISEL-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GISEL-NEXT:    v_readlane_b32 s31, v40, 1
 ; GISEL-NEXT:    v_readlane_b32 s30, v40, 0
+; GISEL-NEXT:    v_readlane_b32 s31, v40, 1
 ; GISEL-NEXT:    s_mov_b32 s32, s33
 ; GISEL-NEXT:    v_readlane_b32 s0, v40, 2
 ; GISEL-NEXT:    s_or_saveexec_b32 s1, -1
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
index f5ccbac9773e7..9478af1bf1cc9 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
@@ -7885,42 +7885,42 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
 ; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(2)
-; SI-NEXT:    v_writelane_b32 v20, s30, 0
-; SI-NEXT:    v_writelane_b32 v20, s31, 1
-; SI-NEXT:    v_writelane_b32 v20, s34, 2
-; SI-NEXT:    v_writelane_b32 v20, s35, 3
-; SI-NEXT:    v_writelane_b32 v20, s36, 4
-; SI-NEXT:    v_writelane_b32 v20, s37, 5
-; SI-NEXT:    v_writelane_b32 v20, s38, 6
-; SI-NEXT:    v_writelane_b32 v20, s39, 7
-; SI-NEXT:    v_writelane_b32 v20, s48, 8
-; SI-NEXT:    v_writelane_b32 v20, s49, 9
-; SI-NEXT:    v_writelane_b32 v20, s50, 10
-; SI-NEXT:    v_writelane_b32 v20, s51, 11
-; SI-NEXT:    v_writelane_b32 v20, s52, 12
-; SI-NEXT:    v_writelane_b32 v20, s53, 13
-; SI-NEXT:    v_writelane_b32 v20, s54, 14
-; SI-NEXT:    v_writelane_b32 v20, s55, 15
-; SI-NEXT:    v_writelane_b32 v20, s64, 16
-; SI-NEXT:    v_writelane_b32 v20, s65, 17
-; SI-NEXT:    v_writelane_b32 v20, s66, 18
-; SI-NEXT:    v_writelane_b32 v20, s67, 19
-; SI-NEXT:    v_writelane_b32 v20, s68, 20
-; SI-NEXT:    v_writelane_b32 v20, s69, 21
-; SI-NEXT:    v_writelane_b32 v20, s70, 22
-; SI-NEXT:    v_writelane_b32 v20, s71, 23
-; SI-NEXT:    v_writelane_b32 v20, s80, 24
-; SI-NEXT:    v_writelane_b32 v20, s81, 25
-; SI-NEXT:    v_writelane_b32 v20, s82, 26
-; SI-NEXT:    v_writelane_b32 v20, s83, 27
-; SI-NEXT:    v_writelane_b32 v20, s84, 28
-; SI-NEXT:    v_writelane_b32 v20, s85, 29
-; SI-NEXT:    v_writelane_b32 v20, s86, 30
-; SI-NEXT:    v_writelane_b32 v20, s87, 31
-; SI-NEXT:    v_writelane_b32 v20, s96, 32
-; SI-NEXT:    v_writelane_b32 v20, s97, 33
+; SI-NEXT:    v_writelane_b32 v20, s34, 0
+; SI-NEXT:    v_writelane_b32 v20, s35, 1
+; SI-NEXT:    v_writelane_b32 v20, s36, 2
+; SI-NEXT:    v_writelane_b32 v20, s37, 3
+; SI-NEXT:    v_writelane_b32 v20, s38, 4
+; SI-NEXT:    v_writelane_b32 v20, s39, 5
+; SI-NEXT:    v_writelane_b32 v20, s48, 6
+; SI-NEXT:    v_writelane_b32 v20, s49, 7
+; SI-NEXT:    v_writelane_b32 v20, s50, 8
+; SI-NEXT:    v_writelane_b32 v20, s51, 9
+; SI-NEXT:    v_writelane_b32 v20, s52, 10
+; SI-NEXT:    v_writelane_b32 v20, s53, 11
+; SI-NEXT:    v_writelane_b32 v20, s54, 12
+; SI-NEXT:    v_writelane_b32 v20, s55, 13
+; SI-NEXT:    v_writelane_b32 v20, s64, 14
+; SI-NEXT:    v_writelane_b32 v20, s65, 15
+; SI-NEXT:    v_writelane_b32 v20, s66, 16
+; SI-NEXT:    v_writelane_b32 v20, s67, 17
+; SI-NEXT:    v_writelane_b32 v20, s68, 18
+; SI-NEXT:    v_writelane_b32 v20, s69, 19
+; SI-NEXT:    v_writelane_b32 v20, s70, 20
+; SI-NEXT:    v_writelane_b32 v20, s71, 21
+; SI-NEXT:    v_writelane_b32 v20, s80, 22
+; SI-NEXT:    v_writelane_b32 v20, s81, 23
+; SI-NEXT:    v_writelane_b32 v20, s82, 24
+; SI-NEXT:    v_writelane_b32 v20, s83, 25
+; SI-NEXT:    v_writelane_b32 v20, s84, 26
+; SI-NEXT:    v_writelane_b32 v20, s85, 27
+; SI-NEXT:    v_writelane_b32 v20, s86, 28
+; SI-NEXT:    v_writelane_b32 v20, s87, 29
+; SI-NEXT:    v_writelane_b32 v20, s96, 30
+; SI-NEXT:    v_writelane_b32 v20, s97, 31
+; SI-NEXT:    v_writelane_b32 v20, s98, 32
+; SI-NEXT:    v_writelane_b32 v20, s99, 33
 ; SI-NEXT:    v_readfirstlane_b32 s44, v19
-; SI-NEXT:    v_writelane_b32 v20, s98, 34
+; SI-NEXT:    v_writelane_b32 v20, s30, 34
 ; SI-NEXT:    v_readfirstlane_b32 s5, v18
 ; SI-NEXT:    v_readfirstlane_b32 s4, v17
 ; SI-NEXT:    v_readfirstlane_b32 s7, v16
@@ -7940,7 +7940,7 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
 ; SI-NEXT:    v_readfirstlane_b32 s45, v2
 ; SI-NEXT:    s_cmp_lg_u32 s44, 0
 ; SI-NEXT:    v_readfirstlane_b32 s44, v1
-; SI-NEXT:    v_writelane_b32 v20, s99, 35
+; SI-NEXT:    v_writelane_b32 v20, s31, 35
 ; SI-NEXT:    ; implicit-def: $vgpr22 : SGPR spill to VGPR lane
 ; SI-NEXT:    ; implicit-def: $vgpr21 : SGPR spill to VGPR lane
 ; SI-NEXT:    s_cbranch_scc0 .LBB13_4
@@ -8810,6 +8810,7 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
 ; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x7c, v0
 ; SI-NEXT:    v_mov_b32_e32 v1, s4
+; SI-NEXT:    v_readlane_b32 s30, v20, 34
 ; SI-NEXT:    v_readlane_b32 s19, v22, 11
 ; SI-NEXT:    v_readlane_b32 s17, v22, 17
 ; SI-NEXT:    v_readlane_b32 s15, v22, 23
@@ -8817,42 +8818,41 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
 ; SI-NEXT:    v_readlane_b32 s11, v22, 35
 ; SI-NEXT:    v_readlane_b32 s9, v22, 41
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT:    v_readlane_b32 s99, v20, 35
-; SI-NEXT:    v_readlane_b32 s98, v20, 34
-; SI-NEXT:    v_readlane_b32 s97, v20, 33
-; SI-NEXT:    v_readlane_b32 s96, v20, 32
-; SI-NEXT:    v_readlane_b32 s87, v20, 31
-; SI-NEXT:    v_readlane_b32 s86, v20, 30
-; SI-NEXT:    v_readlane_b32 s85, v20, 29
-; SI-NEXT:    v_readlane_b32 s84, v20, 28
-; SI-NEXT:    v_readlane_b32 s83, v20, 27
-; SI-NEXT:    v_readlane_b32 s82, v20, 26
-; SI-NEXT:    v_readlane_b32 s81, v20, 25
-; SI-NEXT:    v_readlane_b32 s80, v20, 24
-; SI-NEXT:    v_readlane_b32 s71, v20, 23
-; SI-NEXT:    v_readlane_b32 s70, v20, 22
-; SI-NEXT:    v_readlane_b32 s69, v20, 21
-; SI-NEXT:    v_readlane_b32 s68, v20, 20
-; SI-NEXT:    v_readlane_b32 s67, v20, 19
-; SI-NEXT:    v_readlane_b32 s66, v20, 18
-; SI-NEXT:    v_readlane_b32 s65, v20, 17
-; SI-NEXT:    v_readlane_b32 s64, v20, 16
-; SI-NEXT:    v_readlane_b32 s55, v20, 15
-; SI-NEXT:    v_readlane_b32 s54, v20, 14
-; SI-NEXT:    v_readlane_b32 s53, v20, 13
-; SI-NEXT:    v_readlane_b32 s52, v20, 12
-; SI-NEXT:    v_readlane_b32 s51, v20, 11
-; SI-NEXT:    v_readlane_b32 s50, v20, 10
-; SI-NEXT:    v_readlane_b32 s49, v20, 9
-; SI-NEXT:    v_readlane_b32 s48, v20, 8
-; SI-NEXT:    v_readlane_b32 s39, v20, 7
-; SI-NEXT:    v_readlane_b32 s38, v20, 6
-; SI-NEXT:    v_readlane_b32 s37, v20, 5
-; SI-NEXT:    v_readlane_b32 s36, v20, 4
-; SI-NEXT:    v_readlane_b32 s35, v20, 3
-; SI-NEXT:    v_readlane_b32 s34, v20, 2
-; SI-NEXT:    v_readlane_b32 s31, v20, 1
-; SI-NEXT:    v_readlane_b32 s30, v20, 0
+; SI-NEXT:    v_readlane_b32 s31, v20, 35
+; SI-NEXT:    v_readlane_b32 s99, v20, 33
+; SI-NEXT:    v_readlane_b32 s98, v20, 32
+; SI-NEXT:    v_readlane_b32 s97, v20, 31
+; SI-NEXT:    v_readlane_b32 s96, v20, 30
+; SI-NEXT:    v_readlane_b32 s87, v20, 29
+; SI-NEXT:    v_readlane_b32 s86, v20, 28
+; SI-NEXT:    v_readlane_b32 s85, v20, 27
+; SI-NEXT:    v_readlane_b32 s84, v20, 26
+; SI-NEXT:    v_readlane_b32 s83, v20, 25
+; SI-NEXT:    v_readlane_b32 s82, v20, 24
+; SI-NEXT:    v_readlane_b32 s81, v20, 23
+; SI-NEXT:    v_readlane_b32 s80, v20, 22
+; SI-NEXT:    v_readlane_b32 s71, v20, 21
+; SI-NEXT:    v_readlane_b32 s70, v20, 20
+; SI-NEXT:    v_readlane_b32 s69, v20, 19
+; SI-NEXT:    v_readlane_b32 s68, v20, 18
+; SI-NEXT:    v_readlane_b32 s67, v20, 17
+; SI-NEXT:    v_readlane_b32 s66, v20, 16
+; SI-NEXT:    v_readlane_b32 s65, v20, 15
+; SI-NEXT:    v_readlane_b32 s64, v20, 14
+; SI-NEXT:    v_readlane_b32 s55, v20, 13
+; SI-NEXT:    v_readlane_b32 s54, v20, 12
+; SI-NEXT:    v_readlane_b32 s53, v20, 11
+; SI-NEXT:    v_readlane_b32 s52, v20, 10
+; SI-NEXT:    v_readlane_b32 s51, v20, 9
+; SI-NEXT:    v_readlane_b32 s50, v20, 8
+; SI-NEXT:    v_readlane_b32 s49, v20, 7
+; SI-NEXT:    v_readlane_b32 s48, v20, 6
+; SI-NEXT:    v_readlane_b32 s39, v20, 5
+; SI-NEXT:    v_readlane_b32 s38, v20, 4
+; SI-NEXT:    v_readlane_b32 s37, v20, 3
+; SI-NEXT:    v_readlane_b32 s36, v20, 2
+; SI-NEXT:    v_readlane_b32 s35, v20, 1
+; SI-NEXT:    v_readlane_b32 s34, v20, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -9049,38 +9049,38 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v32, s30, 0
-; VI-NEXT:    v_writelane_b32 v32, s31, 1
-; VI-NEXT:    v_writelane_b32 v32, s34, 2
-; VI-NEXT:    v_writelane_b32 v32, s35, 3
-; VI-NEXT:    v_writelane_b32 v32, s36, 4
-; VI-NEXT:    v_writelane_b32 v32, s37, 5
-; VI-NEXT:    v_writelane_b32 v32, s38, 6
-; VI-NEXT:    v_writelane_b32 v32, s39, 7
-; VI-NEXT:    v_writelane_b32 v32, s48, 8
-; VI-NEXT:    v_writelane_b32 v32, s49, 9
-; VI-NEXT:    v_writelane_b32 v32, s50, 10
-; VI-NEXT:    v_writelane_b32 v32, s51, 11
-; VI-NEXT:    v_writelane_b32 v32, s52, 12
-; VI-NEXT:    v_writelane_b32 v32, s53, 13
-; VI-NEXT:    v_writelane_b32 v32, s54, 14
-; VI-NEXT:    v_writelane_b32 v32, s55, 15
-; VI-NEXT:    v_writelane_b32 v32, s64, 16
-; VI-NEXT:    v_writelane_b32 v32, s65, 17
-; VI-NEXT:    v_writelane_b32 v32, s66, 18
-; VI-NEXT:    v_writelane_b32 v32, s67, 19
-; VI-NEXT:    v_writelane_b32 v32, s68, 20
-; VI-NEXT:    v_writelane_b32 v32, s69, 21
-; VI-NEXT:    v_writelane_b32 v32, s70, 22
-; VI-NEXT:    v_writelane_b32 v32, s71, 23
-; VI-NEXT:    v_writelane_b32 v32, s80, 24
-; VI-NEXT:    v_writelane_b32 v32, s81, 25
-; VI-NEXT:    v_writelane_b32 v32, s82, 26
-; VI-NEXT:    v_writelane_b32 v32, s83, 27
-; VI-NEXT:    v_writelane_b32 v32, s84, 28
-; VI-NEXT:    v_writelane_b32 v32, s85, 29
+; VI-NEXT:    v_writelane_b32 v32, s34, 0
+; VI-NEXT:    v_writelane_b32 v32, s35, 1
+; VI-NEXT:    v_writelane_b32 v32, s36, 2
+; VI-NEXT:    v_writelane_b32 v32, s37, 3
+; VI-NEXT:    v_writelane_b32 v32, s38, 4
+; VI-NEXT:    v_writelane_b32 v32, s39, 5
+; VI-NEXT:    v_writelane_b32 v32, s48, 6
+; VI-NEXT:    v_writelane_b32 v32, s49, 7
+; VI-NEXT:    v_writelane_b32 v32, s50, 8
+; VI-NEXT:    v_writelane_b32 v32, s51, 9
+; VI-NEXT:    v_writelane_b32 v32, s52, 10
+; VI-NEXT:    v_writelane_b32 v32, s53, 11
+; VI-NEXT:    v_writelane_b32 v32, s54, 12
+; VI-NEXT:    v_writelane_b32 v32, s55, 13
+; VI-NEXT:    v_writelane_b32 v32, s64, 14
+; VI-NEXT:    v_writelane_b32 v32, s65, 15
+; VI-NEXT:    v_writelane_b32 v32, s66, 16
+; VI-NEXT:    v_writelane_b32 v32, s67, 17
+; VI-NEXT:    v_writelane_b32 v32, s68, 18
+; VI-NEXT:    v_writelane_b32 v32, s69, 19
+; VI-NEXT:    v_writelane_b32 v32, s70, 20
+; VI-NEXT:    v_writelane_b32 v32, s71, 21
+; VI-NEXT:    v_writelane_b32 v32, s80, 22
+; VI-NEXT:    v_writelane_b32 v32, s81, 23
+; VI-NEXT:    v_writelane_b32 v32, s82, 24
+; VI-NEXT:    v_writelane_b32 v32, s83, 25
+; VI-NEXT:    v_writelane_b32 v32, s84, 26
+; VI-NEXT:    v_writelane_b32 v32, s85, 27
+; VI-NEXT:    v_writelane_b32 v32, s86, 28
+; VI-NEXT:    v_writelane_b32 v32, s87, 29
 ; VI-NEXT:    v_readfirstlane_b32 s44, v19
-; VI-NEXT:    v_writelane_b32 v32, s86, 30
+; VI-NEXT:    v_writelane_b32 v32, s30, 30
 ; VI-NEXT:    v_readfirstlane_b32 s5, v18
 ; VI-NEXT:    v_readfirstlane_b32 s4, v17
 ; VI-NEXT:    v_readfirstlane_b32 s7, v16
@@ -9100,7 +9100,7 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
 ; VI-NEXT:    v_readfirstlane_b32 s45, v2
 ; VI-NEXT:    s_cmp_lg_u32 s44, 0
 ; VI-NEXT:    v_readfirstlane_b32 s44, v1
-; VI-NEXT:    v_writelane_b32 v32, s87, 31
+; VI-NEXT:    v_writelane_b32 v32, s31, 31
 ; VI-NEXT:    ; implicit-def: $vgpr33 : SGPR spill to VGPR lane
 ; VI-NEXT:    s_cbranch_scc0 .LBB13_4
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
@@ -9759,40 +9759,40 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; VI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x7c, v0
+; VI-NEXT:    v_readlane_b32 s30, v32, 30
 ; VI-NEXT:    v_readlane_b32 s7, v33, 1
 ; VI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; VI-NEXT:    v_readlane_b32 s87, v32, 31
-; VI-NEXT:    v_readlane_b32 s86, v32, 30
-; VI-NEXT:    v_readlane_b32 s85, v32, 29
-; VI-NEXT:    v_readlane_b32 s84, v32, 28
-; VI-NEXT:    v_readlane_b32 s83, v32, 27
-; VI-NEXT:    v_readlane_b32 s82, v32, 26
-; VI-NEXT:    v_readlane_b32 s81, v32, 25
-; VI-NEXT:    v_readlane_b32 s80, v32, 24
-; VI-NEXT:    v_readlane_b32 s71, v32, 23
-; VI-NEXT:    v_readlane_b32 s70, v32, 22
-; VI-NEXT:    v_readlane_b32 s69, v32, 21
-; VI-NEXT:    v_readlane_b32 s68, v32, 20
-; VI-NEXT:    v_readlane_b32 s67, v32, 19
-; VI-NEXT:    v_readlane_b32 s66, v32, 18
-; VI-NEXT:    v_readlane_b32 s65, v32, 17
-; VI-NEXT:    v_readlane_b32 s64, v32, 16
-; VI-NEXT:    v_readlane_b32 s55, v32, 15
-; VI-NEXT:    v_readlane_b32 s54, v32, 14
-; VI-NEXT:    v_readlane_b32 s53, v32, 13
-; VI-NEXT:    v_readlane_b32 s52, v32, 12
-; VI-NEXT:    v_readlane_b32 s51, v32, 11
-; VI-NEXT:    v_readlane_b32 s50, v32, 10
-; VI-NEXT:    v_readlane_b32 s49, v32, 9
-; VI-NEXT:    v_readlane_b32 s48, v32, 8
-; VI-NEXT:    v_readlane_b32 s39, v32, 7
-; VI-NEXT:    v_readlane_b32 s38, v32, 6
-; VI-NEXT:    v_readlane_b32 s37, v32, 5
-; VI-NEXT:    v_readlane_b32 s36, v32, 4
-; VI-NEXT:    v_readlane_b32 s35, v32, 3
-; VI-NEXT:    v_readlane_b32 s34, v32, 2
-; VI-NEXT:    v_readlane_b32 s31, v32, 1
-; VI-NEXT:    v_readlane_b32 s30, v32, 0
+; VI-NEXT:    v_readlane_b32 s31, v32, 31
+; VI-NEXT:    v_readlane_b32 s87, v32, 29
+; VI-NEXT:    v_readlane_b32 s86, v32, 28
+; VI-NEXT:    v_readlane_b32 s85, v32, 27
+; VI-NEXT:    v_readlane_b32 s84, v32, 26
+; VI-NEXT:    v_readlane_b32 s83, v32, 25
+; VI-NEXT:    v_readlane_b32 s82, v32, 24
+; VI-NEXT:    v_readlane_b32 s81, v32, 23
+; VI-NEXT:    v_readlane_b32 s80, v32, 22
+; VI-NEXT:    v_readlane_b32 s71, v32, 21
+; VI-NEXT:    v_readlane_b32 s70, v32, 20
+; VI-NEXT:    v_readlane_b32 s69, v32, 19
+; VI-NEXT:    v_readlane_b32 s68, v32, 18
+; VI-NEXT:    v_readlane_b32 s67, v32, 17
+; VI-NEXT:    v_readlane_b32 s66, v32, 16
+; VI-NEXT:    v_readlane_b32 s65, v32, 15
+; VI-NEXT:    v_readlane_b32 s64, v32, 14
+; VI-NEXT:    v_readlane_b32 s55, v32, 13
+; VI-NEXT:    v_readlane_b32 s54, v32, 12
+; VI-NEXT:    v_readlane_b32 s53, v32, 11
+; VI-NEXT:    v_readlane_b32 s52, v32, 10
+; VI-NEXT:    v_readlane_b32 s51, v32, 9
+; VI-NEXT:    v_readlane_b32 s50, v32, 8
+; VI-NEXT:    v_readlane_b32 s49, v32, 7
+; VI-NEXT:    v_readlane_b32 s48, v32, 6
+; VI-NEXT:    v_readlane_b32 s39, v32, 5
+; VI-NEXT:    v_readlane_b32 s38, v32, 4
+; VI-NEXT:    v_readlane_b32 s37, v32, 3
+; VI-NEXT:    v_readlane_b32 s36, v32, 2
+; VI-NEXT:    v_readlane_b32 s35, v32, 1
+; VI-NEXT:    v_readlane_b32 s34, v32, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -9963,42 +9963,42 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
 ; GFX9-NEXT:    buffer_store_dword v29, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v29, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v29, s31, 1
-; GFX9-NEXT:    v_writelane_b32 v29, s34, 2
-; GFX9-NEXT:    v_writelane_b32 v29, s35, 3
-; GFX9-NEXT:    v_writelane_b32 v29, s36, 4
-; GFX9-NEXT:    v_writelane_b32 v29, s37, 5
-; GFX9-NEXT:    v_writelane_b32 v29, s38, 6
-; GFX9-NEXT:    v_writelane_b32 v29, s39, 7
-; GFX9-NEXT:    v_writelane_b32 v29, s48, 8
-; GFX9-NEXT:    v_writelane_b32 v29, s49, 9
-; GFX9-NEXT:    v_writelane_b32 v29, s50, 10
-; GFX9-NEXT:    v_writelane_b32 v29, s51, 11
-; GFX9-NEXT:    v_writelane_b32 v29, s52, 12
-; GFX9-NEXT:    v_writelane_b32 v29, s53, 13
-; GFX9-NEXT:    v_writelane_b32 v29, s54, 14
-; GFX9-NEXT:    v_writelane_b32 v29, s55, 15
-; GFX9-NEXT:    v_writelane_b32 v29, s64, 16
-; GFX9-NEXT:    v_writelane_b32 v29, s65, 17
-; GFX9-NEXT:    v_writelane_b32 v29, s66, 18
-; GFX9-NEXT:    v_writelane_b32 v29, s67, 19
-; GFX9-NEXT:    v_writelane_b32 v29, s68, 20
-; GFX9-NEXT:    v_writelane_b32 v29, s69, 21
-; GFX9-NEXT:    v_writelane_b32 v29, s70, 22
-; GFX9-NEXT:    v_writelane_b32 v29, s71, 23
-; GFX9-NEXT:    v_writelane_b32 v29, s80, 24
-; GFX9-NEXT:    v_writelane_b32 v29, s81, 25
-; GFX9-NEXT:    v_writelane_b32 v29, s82, 26
-; GFX9-NEXT:    v_writelane_b32 v29, s83, 27
-; GFX9-NEXT:    v_writelane_b32 v29, s84, 28
-; GFX9-NEXT:    v_writelane_b32 v29, s85, 29
-; GFX9-NEXT:    v_writelane_b32 v29, s86, 30
-; GFX9-NEXT:    v_writelane_b32 v29, s87, 31
-; GFX9-NEXT:    v_writelane_b32 v29, s96, 32
-; GFX9-NEXT:    v_writelane_b32 v29, s97, 33
+; GFX9-NEXT:    v_writelane_b32 v29, s34, 0
+; GFX9-NEXT:    v_writelane_b32 v29, s35, 1
+; GFX9-NEXT:    v_writelane_b32 v29, s36, 2
+; GFX9-NEXT:    v_writelane_b32 v29, s37, 3
+; GFX9-NEXT:    v_writelane_b32 v29, s38, 4
+; GFX9-NEXT:    v_writelane_b32 v29, s39, 5
+; GFX9-NEXT:    v_writelane_b32 v29, s48, 6
+; GFX9-NEXT:    v_writelane_b32 v29, s49, 7
+; GFX9-NEXT:    v_writelane_b32 v29, s50, 8
+; GFX9-NEXT:    v_writelane_b32 v29, s51, 9
+; GFX9-NEXT:    v_writelane_b32 v29, s52, 10
+; GFX9-NEXT:    v_writelane_b32 v29, s53, 11
+; GFX9-NEXT:    v_writelane_b32 v29, s54, 12
+; GFX9-NEXT:    v_writelane_b32 v29, s55, 13
+; GFX9-NEXT:    v_writelane_b32 v29, s64, 14
+; GFX9-NEXT:    v_writelane_b32 v29, s65, 15
+; GFX9-NEXT:    v_writelane_b32 v29, s66, 16
+; GFX9-NEXT:    v_writelane_b32 v29, s67, 17
+; GFX9-NEXT:    v_writelane_b32 v29, s68, 18
+; GFX9-NEXT:    v_writelane_b32 v29, s69, 19
+; GFX9-NEXT:    v_writelane_b32 v29, s70, 20
+; GFX9-NEXT:    v_writelane_b32 v29, s71, 21
+; GFX9-NEXT:    v_writelane_b32 v29, s80, 22
+; GFX9-NEXT:    v_writelane_b32 v29, s81, 23
+; GFX9-NEXT:    v_writelane_b32 v29, s82, 24
+; GFX9-NEXT:    v_writelane_b32 v29, s83, 25
+; GFX9-NEXT:    v_writelane_b32 v29, s84, 26
+; GFX9-NEXT:    v_writelane_b32 v29, s85, 27
+; GFX9-NEXT:    v_writelane_b32 v29, s86, 28
+; GFX9-NEXT:    v_writelane_b32 v29, s87, 29
+; GFX9-NEXT:    v_writelane_b32 v29, s96, 30
+; GFX9-NEXT:    v_writelane_b32 v29, s97, 31
+; GFX9-NEXT:    v_writelane_b32 v29, s98, 32
+; GFX9-NEXT:    v_writelane_b32 v29, s99, 33
 ; GFX9-NEXT:    v_readfirstlane_b32 s44, v19
-; GFX9-NEXT:    v_writelane_b32 v29, s98, 34
+; GFX9-NEXT:    v_writelane_b32 v29, s30, 34
 ; GFX9-NEXT:    v_readfirstlane_b32 s5, v18
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v17
 ; GFX9-NEXT:    v_readfirstlane_b32 s7, v16
@@ -10018,7 +10018,7 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
 ; GFX9-NEXT:    v_readfirstlane_b32 s45, v2
 ; GFX9-NEXT:    s_cmp_lg_u32 s44, 0
 ; GFX9-NEXT:    v_readfirstlane_b32 s44, v1
-; GFX9-NEXT:    v_writelane_b32 v29, s99, 35
+; GFX9-NEXT:    v_writelane_b32 v29, s31, 35
 ; GFX9-NEXT:    ; implicit-def: $vgpr30 : SGPR spill to VGPR lane
 ; GFX9-NEXT:    s_cbranch_scc0 .LBB13_4
 ; GFX9-NEXT:  ; %bb.1: ; %cmp.false
@@ -10623,43 +10623,43 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
 ; GFX9-NEXT:    v_perm_b32 v1, s4, v3, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX9-NEXT:    v_readlane_b32 s30, v29, 34
 ; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:124
-; GFX9-NEXT:    v_readlane_b32 s99, v29, 35
-; GFX9-NEXT:    v_readlane_b32 s98, v29, 34
-; GFX9-NEXT:    v_readlane_b32 s97, v29, 33
-; GFX9-NEXT:    v_readlane_b32 s96, v29, 32
-; GFX9-NEXT:    v_readlane_b32 s87, v29, 31
-; GFX9-NEXT:    v_readlane_b32 s86, v29, 30
-; GFX9-NEXT:    v_readlane_b32 s85, v29, 29
-; GFX9-NEXT:    v_readlane_b32 s84, v29, 28
-; GFX9-NEXT:    v_readlane_b32 s83, v29, 27
-; GFX9-NEXT:    v_readlane_b32 s82, v29, 26
-; GFX9-NEXT:    v_readlane_b32 s81, v29, 25
-; GFX9-NEXT:    v_readlane_b32 s80, v29, 24
-; GFX9-NEXT:    v_readlane_b32 s71, v29, 23
-; GFX9-NEXT:    v_readlane_b32 s70, v29, 22
-; GFX9-NEXT:    v_readlane_b32 s69, v29, 21
-; GFX9-NEXT:    v_readlane_b32 s68, v29, 20
-; GFX9-NEXT:    v_readlane_b32 s67, v29, 19
-; GFX9-NEXT:    v_readlane_b32 s66, v29, 18
-; GFX9-NEXT:    v_readlane_b32 s65, v29, 17
-; GFX9-NEXT:    v_readlane_b32 s64, v29, 16
-; GFX9-NEXT:    v_readlane_b32 s55, v29, 15
-; GFX9-NEXT:    v_readlane_b32 s54, v29, 14
-; GFX9-NEXT:    v_readlane_b32 s53, v29, 13
-; GFX9-NEXT:    v_readlane_b32 s52, v29, 12
-; GFX9-NEXT:    v_readlane_b32 s51, v29, 11
-; GFX9-NEXT:    v_readlane_b32 s50, v29, 10
-; GFX9-NEXT:    v_readlane_b32 s49, v29, 9
-; GFX9-NEXT:    v_readlane_b32 s48, v29, 8
-; GFX9-NEXT:    v_readlane_b32 s39, v29, 7
-; GFX9-NEXT:    v_readlane_b32 s38, v29, 6
-; GFX9-NEXT:    v_readlane_b32 s37, v29, 5
-; GFX9-NEXT:    v_readlane_b32 s36, v29, 4
-; GFX9-NEXT:    v_readlane_b32 s35, v29, 3
-; GFX9-NEXT:    v_readlane_b32 s34, v29, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v29, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v29, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v29, 35
+; GFX9-NEXT:    v_readlane_b32 s99, v29, 33
+; GFX9-NEXT:    v_readlane_b32 s98, v29, 32
+; GFX9-NEXT:    v_readlane_b32 s97, v29, 31
+; GFX9-NEXT:    v_readlane_b32 s96, v29, 30
+; GFX9-NEXT:    v_readlane_b32 s87, v29, 29
+; GFX9-NEXT:    v_readlane_b32 s86, v29, 28
+; GFX9-NEXT:    v_readlane_b32 s85, v29, 27
+; GFX9-NEXT:    v_readlane_b32 s84, v29, 26
+; GFX9-NEXT:    v_readlane_b32 s83, v29, 25
+; GFX9-NEXT:    v_readlane_b32 s82, v29, 24
+; GFX9-NEXT:    v_readlane_b32 s81, v29, 23
+; GFX9-NEXT:    v_readlane_b32 s80, v29, 22
+; GFX9-NEXT:    v_readlane_b32 s71, v29, 21
+; GFX9-NEXT:    v_readlane_b32 s70, v29, 20
+; GFX9-NEXT:    v_readlane_b32 s69, v29, 19
+; GFX9-NEXT:    v_readlane_b32 s68, v29, 18
+; GFX9-NEXT:    v_readlane_b32 s67, v29, 17
+; GFX9-NEXT:    v_readlane_b32 s66, v29, 16
+; GFX9-NEXT:    v_readlane_b32 s65, v29, 15
+; GFX9-NEXT:    v_readlane_b32 s64, v29, 14
+; GFX9-NEXT:    v_readlane_b32 s55, v29, 13
+; GFX9-NEXT:    v_readlane_b32 s54, v29, 12
+; GFX9-NEXT:    v_readlane_b32 s53, v29, 11
+; GFX9-NEXT:    v_readlane_b32 s52, v29, 10
+; GFX9-NEXT:    v_readlane_b32 s51, v29, 9
+; GFX9-NEXT:    v_readlane_b32 s50, v29, 8
+; GFX9-NEXT:    v_readlane_b32 s49, v29, 7
+; GFX9-NEXT:    v_readlane_b32 s48, v29, 6
+; GFX9-NEXT:    v_readlane_b32 s39, v29, 5
+; GFX9-NEXT:    v_readlane_b32 s38, v29, 4
+; GFX9-NEXT:    v_readlane_b32 s37, v29, 3
+; GFX9-NEXT:    v_readlane_b32 s36, v29, 2
+; GFX9-NEXT:    v_readlane_b32 s35, v29, 1
+; GFX9-NEXT:    v_readlane_b32 s34, v29, 0
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -10825,66 +10825,66 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
 ; GFX11-NEXT:    scratch_store_b32 off, v26, s32 offset:8
 ; GFX11-NEXT:    scratch_store_b32 off, v27, s32 offset:12
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s4
-; GFX11-NEXT:    v_writelane_b32 v24, s30, 0
-; GFX11-NEXT:    v_writelane_b32 v25, s96, 0
+; GFX11-NEXT:    v_writelane_b32 v24, s34, 0
+; GFX11-NEXT:    v_writelane_b32 v25, s98, 0
 ; GFX11-NEXT:    v_readfirstlane_b32 s42, v15
 ; GFX11-NEXT:    v_readfirstlane_b32 s5, v14
 ; GFX11-NEXT:    v_readfirstlane_b32 s4, v13
-; GFX11-NEXT:    v_writelane_b32 v24, s31, 1
-; GFX11-NEXT:    v_writelane_b32 v25, s97, 1
+; GFX11-NEXT:    v_writelane_b32 v24, s35, 1
+; GFX11-NEXT:    v_writelane_b32 v25, s99, 1
 ; GFX11-NEXT:    v_readfirstlane_b32 s7, v12
 ; GFX11-NEXT:    v_readfirstlane_b32 s6, v11
 ; GFX11-NEXT:    v_readfirstlane_b32 s9, v10
-; GFX11-NEXT:    v_writelane_b32 v24, s34, 2
-; GFX11-NEXT:    v_writelane_b32 v25, s98, 2
+; GFX11-NEXT:    v_writelane_b32 v24, s36, 2
+; GFX11-NEXT:    v_writelane_b32 v25, s100, 2
 ; GFX11-NEXT:    v_readfirstlane_b32 s8, v9
 ; GFX11-NEXT:    v_readfirstlane_b32 s11, v8
 ; GFX11-NEXT:    v_readfirstlane_b32 s10, v7
-; GFX11-NEXT:    v_writelane_b32 v24, s35, 3
-; GFX11-NEXT:    v_writelane_b32 v25, s99, 3
+; GFX11-NEXT:    v_writelane_b32 v24, s37, 3
+; GFX11-NEXT:    v_writelane_b32 v25, s101, 3
 ; GFX11-NEXT:    v_readfirstlane_b32 s13, v6
 ; GFX11-NEXT:    v_readfirstlane_b32 s12, v5
 ; GFX11-NEXT:    v_readfirstlane_b32 s15, v4
-; GFX11-NEXT:    v_writelane_b32 v24, s36, 4
-; GFX11-NEXT:    v_writelane_b32 v25, s100, 4
+; GFX11-NEXT:    v_writelane_b32 v24, s38, 4
+; GFX11-NEXT:    v_writelane_b32 v25, s102, 4
 ; GFX11-NEXT:    v_readfirstlane_b32 s14, v3
 ; GFX11-NEXT:    v_readfirstlane_b32 s41, v2
 ; GFX11-NEXT:    v_readfirstlane_b32 s40, v1
-; GFX11-NEXT:    v_writelane_b32 v24, s37, 5
-; GFX11-NEXT:    v_writelane_b32 v25, s101, 5
+; GFX11-NEXT:    v_writelane_b32 v24, s39, 5
+; GFX11-NEXT:    v_writelane_b32 v25, s103, 5
 ; GFX11-NEXT:    s_cmp_lg_u32 s42, 0
 ; GFX11-NEXT:    s_mov_b32 vcc_lo, 0
 ; GFX11-NEXT:    ; implicit-def: $vgpr27 : SGPR spill to VGPR lane
 ; GFX11-NEXT:    ; implicit-def: $vgpr26 : SGPR spill to VGPR lane
-; GFX11-NEXT:    v_writelane_b32 v24, s38, 6
-; GFX11-NEXT:    v_writelane_b32 v25, s102, 6
-; GFX11-NEXT:    v_writelane_b32 v24, s39, 7
-; GFX11-NEXT:    v_writelane_b32 v25, s103, 7
-; GFX11-NEXT:    v_writelane_b32 v24, s48, 8
-; GFX11-NEXT:    v_writelane_b32 v25, s104, 8
-; GFX11-NEXT:    v_writelane_b32 v24, s49, 9
-; GFX11-NEXT:    v_writelane_b32 v24, s50, 10
-; GFX11-NEXT:    v_writelane_b32 v24, s51, 11
-; GFX11-NEXT:    v_writelane_b32 v24, s52, 12
-; GFX11-NEXT:    v_writelane_b32 v24, s53, 13
-; GFX11-NEXT:    v_writelane_b32 v24, s54, 14
-; GFX11-NEXT:    v_writelane_b32 v24, s55, 15
-; GFX11-NEXT:    v_writelane_b32 v24, s64, 16
-; GFX11-NEXT:    v_writelane_b32 v24, s65, 17
-; GFX11-NEXT:    v_writelane_b32 v24, s66, 18
-; GFX11-NEXT:    v_writelane_b32 v24, s67, 19
-; GFX11-NEXT:    v_writelane_b32 v24, s68, 20
-; GFX11-NEXT:    v_writelane_b32 v24, s69, 21
-; GFX11-NEXT:    v_writelane_b32 v24, s70, 22
-; GFX11-NEXT:    v_writelane_b32 v24, s71, 23
-; GFX11-NEXT:    v_writelane_b32 v24, s80, 24
-; GFX11-NEXT:    v_writelane_b32 v24, s81, 25
-; GFX11-NEXT:    v_writelane_b32 v24, s82, 26
-; GFX11-NEXT:    v_writelane_b32 v24, s83, 27
-; GFX11-NEXT:    v_writelane_b32 v24, s84, 28
-; GFX11-NEXT:    v_writelane_b32 v24, s85, 29
-; GFX11-NEXT:    v_writelane_b32 v24, s86, 30
-; GFX11-NEXT:    v_writelane_b32 v24, s87, 31
+; GFX11-NEXT:    v_writelane_b32 v24, s48, 6
+; GFX11-NEXT:    v_writelane_b32 v25, s104, 6
+; GFX11-NEXT:    v_writelane_b32 v24, s49, 7
+; GFX11-NEXT:    v_writelane_b32 v25, s30, 7
+; GFX11-NEXT:    v_writelane_b32 v24, s50, 8
+; GFX11-NEXT:    v_writelane_b32 v25, s31, 8
+; GFX11-NEXT:    v_writelane_b32 v24, s51, 9
+; GFX11-NEXT:    v_writelane_b32 v24, s52, 10
+; GFX11-NEXT:    v_writelane_b32 v24, s53, 11
+; GFX11-NEXT:    v_writelane_b32 v24, s54, 12
+; GFX11-NEXT:    v_writelane_b32 v24, s55, 13
+; GFX11-NEXT:    v_writelane_b32 v24, s64, 14
+; GFX11-NEXT:    v_writelane_b32 v24, s65, 15
+; GFX11-NEXT:    v_writelane_b32 v24, s66, 16
+; GFX11-NEXT:    v_writelane_b32 v24, s67, 17
+; GFX11-NEXT:    v_writelane_b32 v24, s68, 18
+; GFX11-NEXT:    v_writelane_b32 v24, s69, 19
+; GFX11-NEXT:    v_writelane_b32 v24, s70, 20
+; GFX11-NEXT:    v_writelane_b32 v24, s71, 21
+; GFX11-NEXT:    v_writelane_b32 v24, s80, 22
+; GFX11-NEXT:    v_writelane_b32 v24, s81, 23
+; GFX11-NEXT:    v_writelane_b32 v24, s82, 24
+; GFX11-NEXT:    v_writelane_b32 v24, s83, 25
+; GFX11-NEXT:    v_writelane_b32 v24, s84, 26
+; GFX11-NEXT:    v_writelane_b32 v24, s85, 27
+; GFX11-NEXT:    v_writelane_b32 v24, s86, 28
+; GFX11-NEXT:    v_writelane_b32 v24, s87, 29
+; GFX11-NEXT:    v_writelane_b32 v24, s96, 30
+; GFX11-NEXT:    v_writelane_b32 v24, s97, 31
 ; GFX11-NEXT:    s_cbranch_scc0 .LBB13_4
 ; GFX11-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX11-NEXT:    s_lshr_b32 s42, s5, 24
@@ -11213,11 +11213,11 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
 ; GFX11-NEXT:    s_lshr_b64 s[42:43], s[4:5], 24
 ; GFX11-NEXT:  .LBB13_3: ; %end
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0xc0c0004
-; GFX11-NEXT:    v_readlane_b32 s31, v24, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_perm_b32 v2, s103, s30, v1
-; GFX11-NEXT:    v_readlane_b32 s103, v25, 7
-; GFX11-NEXT:    v_readlane_b32 s30, v24, 0
+; GFX11-NEXT:    v_readlane_b32 s30, v25, 7
+; GFX11-NEXT:    v_readlane_b32 s31, v25, 8
+; GFX11-NEXT:    v_readlane_b32 s103, v25, 5
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX11-NEXT:    v_perm_b32 v19, s83, s81, v1
 ; GFX11-NEXT:    v_perm_b32 v3, s0, s104, v1
@@ -11298,95 +11298,95 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
 ; GFX11-NEXT:    v_readlane_b32 s0, v26, 1
 ; GFX11-NEXT:    v_readlane_b32 s1, v27, 31
 ; GFX11-NEXT:    v_or_b32_e32 v9, v9, v3
-; GFX11-NEXT:    v_readlane_b32 s104, v25, 8
+; GFX11-NEXT:    v_readlane_b32 s104, v25, 6
 ; GFX11-NEXT:    v_or_b32_e32 v3, v10, v11
 ; GFX11-NEXT:    v_perm_b32 v12, s0, s62, v1
 ; GFX11-NEXT:    v_readlane_b32 s0, v26, 2
 ; GFX11-NEXT:    scratch_store_b128 v0, v[6:9], off offset:48
-; GFX11-NEXT:    v_readlane_b32 s102, v25, 6
-; GFX11-NEXT:    v_readlane_b32 s101, v25, 5
+; GFX11-NEXT:    v_readlane_b32 s102, v25, 4
+; GFX11-NEXT:    v_readlane_b32 s101, v25, 3
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v12
 ; GFX11-NEXT:    v_perm_b32 v4, s40, s0, v1
 ; GFX11-NEXT:    v_readlane_b32 s0, v27, 30
-; GFX11-NEXT:    v_readlane_b32 s100, v25, 4
-; GFX11-NEXT:    v_readlane_b32 s99, v25, 3
-; GFX11-NEXT:    v_readlane_b32 s98, v25, 2
+; GFX11-NEXT:    v_readlane_b32 s100, v25, 2
+; GFX11-NEXT:    v_readlane_b32 s99, v25, 1
+; GFX11-NEXT:    v_readlane_b32 s98, v25, 0
 ; GFX11-NEXT:    v_or_b32_e32 v4, v4, v5
 ; GFX11-NEXT:    v_perm_b32 v10, s1, s0, v1
 ; GFX11-NEXT:    v_readlane_b32 s0, v26, 0
 ; GFX11-NEXT:    v_readlane_b32 s1, v27, 26
-; GFX11-NEXT:    v_readlane_b32 s97, v25, 1
-; GFX11-NEXT:    v_readlane_b32 s96, v25, 0
+; GFX11-NEXT:    v_readlane_b32 s97, v24, 31
+; GFX11-NEXT:    v_readlane_b32 s96, v24, 30
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v10
 ; GFX11-NEXT:    v_perm_b32 v5, s41, s0, v1
 ; GFX11-NEXT:    v_readlane_b32 s0, v27, 28
-; GFX11-NEXT:    v_readlane_b32 s87, v24, 31
-; GFX11-NEXT:    v_readlane_b32 s86, v24, 30
-; GFX11-NEXT:    v_readlane_b32 s85, v24, 29
+; GFX11-NEXT:    v_readlane_b32 s87, v24, 29
+; GFX11-NEXT:    v_readlane_b32 s86, v24, 28
+; GFX11-NEXT:    v_readlane_b32 s85, v24, 27
 ; GFX11-NEXT:    v_or_b32_e32 v5, v5, v7
 ; GFX11-NEXT:    v_perm_b32 v11, s0, s60, v1
 ; GFX11-NEXT:    v_readlane_b32 s0, v27, 25
-; GFX11-NEXT:    v_readlane_b32 s84, v24, 28
-; GFX11-NEXT:    v_readlane_b32 s83, v24, 27
+; GFX11-NEXT:    v_readlane_b32 s84, v24, 26
+; GFX11-NEXT:    v_readlane_b32 s83, v24, 25
 ; GFX11-NEXT:    scratch_store_b128 v0, v[2:5], off offset:64
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v11
 ; GFX11-NEXT:    v_perm_b32 v6, s1, s0, v1
 ; GFX11-NEXT:    v_readlane_b32 s0, v27, 29
 ; GFX11-NEXT:    v_readlane_b32 s1, v27, 21
-; GFX11-NEXT:    v_readlane_b32 s82, v24, 26
-; GFX11-NEXT:    v_readlane_b32 s81, v24, 25
+; GFX11-NEXT:    v_readlane_b32 s82, v24, 24
+; GFX11-NEXT:    v_readlane_b32 s81, v24, 23
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v6
 ; GFX11-NEXT:    v_perm_b32 v8, s14, s0, v1
 ; GFX11-NEXT:    v_readlane_b32 s0, v27, 27
-; GFX11-NEXT:    v_readlane_b32 s80, v24, 24
-; GFX11-NEXT:    v_readlane_b32 s71, v24, 23
-; GFX11-NEXT:    v_readlane_b32 s70, v24, 22
+; GFX11-NEXT:    v_readlane_b32 s80, v24, 22
+; GFX11-NEXT:    v_readlane_b32 s71, v24, 21
+; GFX11-NEXT:    v_readlane_b32 s70, v24, 20
 ; GFX11-NEXT:    v_or_b32_e32 v6, v8, v9
 ; GFX11-NEXT:    v_perm_b32 v10, s15, s0, v1
 ; GFX11-NEXT:    v_readlane_b32 s0, v27, 23
-; GFX11-NEXT:    v_readlane_b32 s69, v24, 21
-; GFX11-NEXT:    v_readlane_b32 s68, v24, 20
-; GFX11-NEXT:    v_readlane_b32 s67, v24, 19
+; GFX11-NEXT:    v_readlane_b32 s69, v24, 19
+; GFX11-NEXT:    v_readlane_b32 s68, v24, 18
+; GFX11-NEXT:    v_readlane_b32 s67, v24, 17
 ; GFX11-NEXT:    v_or_b32_e32 v7, v10, v11
 ; GFX11-NEXT:    v_perm_b32 v12, s0, s58, v1
 ; GFX11-NEXT:    v_readlane_b32 s0, v27, 24
-; GFX11-NEXT:    v_readlane_b32 s66, v24, 18
-; GFX11-NEXT:    v_readlane_b32 s65, v24, 17
-; GFX11-NEXT:    v_readlane_b32 s64, v24, 16
+; GFX11-NEXT:    v_readlane_b32 s66, v24, 16
+; GFX11-NEXT:    v_readlane_b32 s65, v24, 15
+; GFX11-NEXT:    v_readlane_b32 s64, v24, 14
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v12
 ; GFX11-NEXT:    v_perm_b32 v8, s12, s0, v1
 ; GFX11-NEXT:    v_readlane_b32 s0, v27, 20
-; GFX11-NEXT:    v_readlane_b32 s55, v24, 15
-; GFX11-NEXT:    v_readlane_b32 s54, v24, 14
-; GFX11-NEXT:    v_readlane_b32 s53, v24, 13
+; GFX11-NEXT:    v_readlane_b32 s55, v24, 13
+; GFX11-NEXT:    v_readlane_b32 s54, v24, 12
+; GFX11-NEXT:    v_readlane_b32 s53, v24, 11
 ; GFX11-NEXT:    v_or_b32_e32 v8, v8, v9
 ; GFX11-NEXT:    v_perm_b32 v10, s1, s0, v1
 ; GFX11-NEXT:    v_readlane_b32 s0, v27, 22
 ; GFX11-NEXT:    v_readlane_b32 s1, v27, 16
-; GFX11-NEXT:    v_readlane_b32 s52, v24, 12
-; GFX11-NEXT:    v_readlane_b32 s51, v24, 11
+; GFX11-NEXT:    v_readlane_b32 s52, v24, 10
+; GFX11-NEXT:    v_readlane_b32 s51, v24, 9
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v10
 ; GFX11-NEXT:    v_perm_b32 v9, s13, s0, v1
 ; GFX11-NEXT:    v_readlane_b32 s0, v27, 18
-; GFX11-NEXT:    v_readlane_b32 s50, v24, 10
-; GFX11-NEXT:    v_readlane_b32 s49, v24, 9
-; GFX11-NEXT:    v_readlane_b32 s48, v24, 8
+; GFX11-NEXT:    v_readlane_b32 s50, v24, 8
+; GFX11-NEXT:    v_readlane_b32 s49, v24, 7
+; GFX11-NEXT:    v_readlane_b32 s48, v24, 6
 ; GFX11-NEXT:    v_or_b32_e32 v9, v9, v3
 ; GFX11-NEXT:    v_perm_b32 v11, s0, s56, v1
 ; GFX11-NEXT:    v_readlane_b32 s0, v27, 15
-; GFX11-NEXT:    v_readlane_b32 s39, v24, 7
-; GFX11-NEXT:    v_readlane_b32 s38, v24, 6
-; GFX11-NEXT:    v_readlane_b32 s37, v24, 5
+; GFX11-NEXT:    v_readlane_b32 s39, v24, 5
+; GFX11-NEXT:    v_readlane_b32 s38, v24, 4
+; GFX11-NEXT:    v_readlane_b32 s37, v24, 3
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v11
 ; GFX11-NEXT:    v_perm_b32 v2, s1, s0, v1
 ; GFX11-NEXT:    v_readlane_b32 s0, v27, 19
 ; GFX11-NEXT:    v_readlane_b32 s1, v27, 11
-; GFX11-NEXT:    v_readlane_b32 s36, v24, 4
-; GFX11-NEXT:    v_readlane_b32 s35, v24, 3
+; GFX11-NEXT:    v_readlane_b32 s36, v24, 2
+; GFX11-NEXT:    v_readlane_b32 s35, v24, 1
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
 ; GFX11-NEXT:    v_perm_b32 v4, s10, s0, v1
 ; GFX11-NEXT:    v_readlane_b32 s0, v27, 17
-; GFX11-NEXT:    v_readlane_b32 s34, v24, 2
+; GFX11-NEXT:    v_readlane_b32 s34, v24, 0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_or_b32_e32 v2, v4, v5
 ; GFX11-NEXT:    v_perm_b32 v10, s11, s0, v1
@@ -22280,43 +22280,42 @@ define inreg <64 x bfloat> @bitcast_v32i32_to_v64bf16_scalar(<32 x i32> inreg %a
 ; SI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_writelane_b32 v33, s30, 0
-; SI-NEXT:    v_writelane_b32 v33, s31, 1
-; SI-NEXT:    v_writelane_b32 v33, s34, 2
-; SI-NEXT:    v_writelane_b32 v33, s35, 3
-; SI-NEXT:    v_writelane_b32 v33, s36, 4
-; SI-NEXT:    v_writelane_b32 v33, s37, 5
-; SI-NEXT:    v_writelane_b32 v33, s38, 6
-; SI-NEXT:    v_writelane_b32 v33, s39, 7
-; SI-NEXT:    v_writelane_b32 v33, s48, 8
-; SI-NEXT:    v_writelane_b32 v33, s49, 9
-; SI-NEXT:    v_writelane_b32 v33, s50, 10
-; SI-NEXT:    v_writelane_b32 v33, s51, 11
-; SI-NEXT:    v_writelane_b32 v33, s52, 12
-; SI-NEXT:    v_writelane_b32 v33, s53, 13
-; SI-NEXT:    v_writelane_b32 v33, s54, 14
-; SI-NEXT:    v_writelane_b32 v33, s55, 15
-; SI-NEXT:    v_writelane_b32 v33, s64, 16
-; SI-NEXT:    v_writelane_b32 v33, s65, 17
-; SI-NEXT:    v_writelane_b32 v33, s66, 18
-; SI-NEXT:    v_writelane_b32 v33, s67, 19
-; SI-NEXT:    v_writelane_b32 v33, s68, 20
-; SI-NEXT:    v_writelane_b32 v33, s69, 21
-; SI-NEXT:    v_writelane_b32 v33, s70, 22
-; SI-NEXT:    v_writelane_b32 v33, s71, 23
-; SI-NEXT:    v_writelane_b32 v33, s80, 24
-; SI-NEXT:    v_writelane_b32 v33, s81, 25
-; SI-NEXT:    v_writelane_b32 v33, s82, 26
-; SI-NEXT:    v_writelane_b32 v33, s83, 27
-; SI-NEXT:    v_writelane_b32 v33, s84, 28
-; SI-NEXT:    v_writelane_b32 v33, s85, 29
-; SI-NEXT:    v_writelane_b32 v33, s86, 30
-; SI-NEXT:    v_writelane_b32 v33, s87, 31
-; SI-NEXT:    v_writelane_b32 v33, s96, 32
-; SI-NEXT:    v_writelane_b32 v33, s97, 33
-; SI-NEXT:    v_writelane_b32 v33, s98, 34
+; SI-NEXT:    v_writelane_b32 v33, s34, 0
+; SI-NEXT:    v_writelane_b32 v33, s35, 1
+; SI-NEXT:    v_writelane_b32 v33, s36, 2
+; SI-NEXT:    v_writelane_b32 v33, s37, 3
+; SI-NEXT:    v_writelane_b32 v33, s38, 4
+; SI-NEXT:    v_writelane_b32 v33, s39, 5
+; SI-NEXT:    v_writelane_b32 v33, s48, 6
+; SI-NEXT:    v_writelane_b32 v33, s49, 7
+; SI-NEXT:    v_writelane_b32 v33, s50, 8
+; SI-NEXT:    v_writelane_b32 v33, s51, 9
+; SI-NEXT:    v_writelane_b32 v33, s52, 10
+; SI-NEXT:    v_writelane_b32 v33, s53, 11
+; SI-NEXT:    v_writelane_b32 v33, s54, 12
+; SI-NEXT:    v_writelane_b32 v33, s55, 13
+; SI-NEXT:    v_writelane_b32 v33, s64, 14
+; SI-NEXT:    v_writelane_b32 v33, s65, 15
+; SI-NEXT:    v_writelane_b32 v33, s66, 16
+; SI-NEXT:    v_writelane_b32 v33, s67, 17
+; SI-NEXT:    v_writelane_b32 v33, s68, 18
+; SI-NEXT:    v_writelane_b32 v33, s69, 19
+; SI-NEXT:    v_writelane_b32 v33, s70, 20
+; SI-NEXT:    v_writelane_b32 v33, s71, 21
+; SI-NEXT:    v_writelane_b32 v33, s80, 22
+; SI-NEXT:    v_writelane_b32 v33, s81, 23
+; SI-NEXT:    v_writelane_b32 v33, s82, 24
+; SI-NEXT:    v_writelane_b32 v33, s83, 25
+; SI-NEXT:    v_writelane_b32 v33, s84, 26
+; SI-NEXT:    v_writelane_b32 v33, s85, 27
+; SI-NEXT:    v_writelane_b32 v33, s86, 28
+; SI-NEXT:    v_writelane_b32 v33, s87, 29
+; SI-NEXT:    v_writelane_b32 v33, s96, 30
+; SI-NEXT:    v_writelane_b32 v33, s97, 31
+; SI-NEXT:    v_writelane_b32 v33, s98, 32
+; SI-NEXT:    v_writelane_b32 v33, s99, 33
 ; SI-NEXT:    v_readfirstlane_b32 s4, v18
-; SI-NEXT:    v_writelane_b32 v33, s99, 35
+; SI-NEXT:    v_writelane_b32 v33, s30, 34
 ; SI-NEXT:    v_readfirstlane_b32 s70, v17
 ; SI-NEXT:    v_readfirstlane_b32 s71, v16
 ; SI-NEXT:    v_readfirstlane_b32 s80, v15
@@ -22336,6 +22335,7 @@ define inreg <64 x bfloat> @bitcast_v32i32_to_v64bf16_scalar(<32 x i32> inreg %a
 ; SI-NEXT:    v_readfirstlane_b32 s8, v1
 ; SI-NEXT:    s_cmp_lg_u32 s4, 0
 ; SI-NEXT:    v_readfirstlane_b32 s9, v0
+; SI-NEXT:    v_writelane_b32 v33, s31, 35
 ; SI-NEXT:    ; implicit-def: $vgpr34 : SGPR spill to VGPR lane
 ; SI-NEXT:    s_cbranch_scc0 .LBB17_2
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
@@ -22745,43 +22745,43 @@ define inreg <64 x bfloat> @bitcast_v32i32_to_v64bf16_scalar(<32 x i32> inreg %a
 ; SI-NEXT:    v_mul_f32_e64 v31, 1.0, s5
 ; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v31
 ; SI-NEXT:    v_mul_f32_e64 v31, 1.0, s4
+; SI-NEXT:    v_readlane_b32 s30, v33, 34
 ; SI-NEXT:    v_lshr_b64 v[31:32], v[31:32], 16
-; SI-NEXT:    v_readlane_b32 s99, v33, 35
-; SI-NEXT:    v_readlane_b32 s98, v33, 34
-; SI-NEXT:    v_readlane_b32 s97, v33, 33
-; SI-NEXT:    v_readlane_b32 s96, v33, 32
-; SI-NEXT:    v_readlane_b32 s87, v33, 31
-; SI-NEXT:    v_readlane_b32 s86, v33, 30
-; SI-NEXT:    v_readlane_b32 s85, v33, 29
-; SI-NEXT:    v_readlane_b32 s84, v33, 28
-; SI-NEXT:    v_readlane_b32 s83, v33, 27
-; SI-NEXT:    v_readlane_b32 s82, v33, 26
-; SI-NEXT:    v_readlane_b32 s81, v33, 25
-; SI-NEXT:    v_readlane_b32 s80, v33, 24
-; SI-NEXT:    v_readlane_b32 s71, v33, 23
-; SI-NEXT:    v_readlane_b32 s70, v33, 22
-; SI-NEXT:    v_readlane_b32 s69, v33, 21
-; SI-NEXT:    v_readlane_b32 s68, v33, 20
-; SI-NEXT:    v_readlane_b32 s67, v33, 19
-; SI-NEXT:    v_readlane_b32 s66, v33, 18
-; SI-NEXT:    v_readlane_b32 s65, v33, 17
-; SI-NEXT:    v_readlane_b32 s64, v33, 16
-; SI-NEXT:    v_readlane_b32 s55, v33, 15
-; SI-NEXT:    v_readlane_b32 s54, v33, 14
-; SI-NEXT:    v_readlane_b32 s53, v33, 13
-; SI-NEXT:    v_readlane_b32 s52, v33, 12
-; SI-NEXT:    v_readlane_b32 s51, v33, 11
-; SI-NEXT:    v_readlane_b32 s50, v33, 10
-; SI-NEXT:    v_readlane_b32 s49, v33, 9
-; SI-NEXT:    v_readlane_b32 s48, v33, 8
-; SI-NEXT:    v_readlane_b32 s39, v33, 7
-; SI-NEXT:    v_readlane_b32 s38, v33, 6
-; SI-NEXT:    v_readlane_b32 s37, v33, 5
-; SI-NEXT:    v_readlane_b32 s36, v33, 4
-; SI-NEXT:    v_readlane_b32 s35, v33, 3
-; SI-NEXT:    v_readlane_b32 s34, v33, 2
-; SI-NEXT:    v_readlane_b32 s31, v33, 1
-; SI-NEXT:    v_readlane_b32 s30, v33, 0
+; SI-NEXT:    v_readlane_b32 s31, v33, 35
+; SI-NEXT:    v_readlane_b32 s99, v33, 33
+; SI-NEXT:    v_readlane_b32 s98, v33, 32
+; SI-NEXT:    v_readlane_b32 s97, v33, 31
+; SI-NEXT:    v_readlane_b32 s96, v33, 30
+; SI-NEXT:    v_readlane_b32 s87, v33, 29
+; SI-NEXT:    v_readlane_b32 s86, v33, 28
+; SI-NEXT:    v_readlane_b32 s85, v33, 27
+; SI-NEXT:    v_readlane_b32 s84, v33, 26
+; SI-NEXT:    v_readlane_b32 s83, v33, 25
+; SI-NEXT:    v_readlane_b32 s82, v33, 24
+; SI-NEXT:    v_readlane_b32 s81, v33, 23
+; SI-NEXT:    v_readlane_b32 s80, v33, 22
+; SI-NEXT:    v_readlane_b32 s71, v33, 21
+; SI-NEXT:    v_readlane_b32 s70, v33, 20
+; SI-NEXT:    v_readlane_b32 s69, v33, 19
+; SI-NEXT:    v_readlane_b32 s68, v33, 18
+; SI-NEXT:    v_readlane_b32 s67, v33, 17
+; SI-NEXT:    v_readlane_b32 s66, v33, 16
+; SI-NEXT:    v_readlane_b32 s65, v33, 15
+; SI-NEXT:    v_readlane_b32 s64, v33, 14
+; SI-NEXT:    v_readlane_b32 s55, v33, 13
+; SI-NEXT:    v_readlane_b32 s54, v33, 12
+; SI-NEXT:    v_readlane_b32 s53, v33, 11
+; SI-NEXT:    v_readlane_b32 s52, v33, 10
+; SI-NEXT:    v_readlane_b32 s51, v33, 9
+; SI-NEXT:    v_readlane_b32 s50, v33, 8
+; SI-NEXT:    v_readlane_b32 s49, v33, 7
+; SI-NEXT:    v_readlane_b32 s48, v33, 6
+; SI-NEXT:    v_readlane_b32 s39, v33, 5
+; SI-NEXT:    v_readlane_b32 s38, v33, 4
+; SI-NEXT:    v_readlane_b32 s37, v33, 3
+; SI-NEXT:    v_readlane_b32 s36, v33, 2
+; SI-NEXT:    v_readlane_b32 s35, v33, 1
+; SI-NEXT:    v_readlane_b32 s34, v33, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
@@ -26076,15 +26076,17 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v63, s30, 0
-; SI-NEXT:    v_writelane_b32 v63, s31, 1
+; SI-NEXT:    v_writelane_b32 v63, s34, 0
+; SI-NEXT:    v_writelane_b32 v63, s35, 1
+; SI-NEXT:    v_writelane_b32 v63, s36, 2
+; SI-NEXT:    v_writelane_b32 v63, s37, 3
+; SI-NEXT:    v_writelane_b32 v63, s30, 4
 ; SI-NEXT:    v_readfirstlane_b32 vcc_lo, v2
-; SI-NEXT:    v_writelane_b32 v63, s34, 2
+; SI-NEXT:    v_writelane_b32 v63, s31, 5
 ; SI-NEXT:    s_and_b32 s12, s25, 0xffff0000
 ; SI-NEXT:    s_and_b32 s30, vcc_lo, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s31, vcc_lo, 16
 ; SI-NEXT:    v_readfirstlane_b32 vcc_lo, v1
-; SI-NEXT:    v_writelane_b32 v63, s35, 3
 ; SI-NEXT:    s_and_b32 s6, s28, 0xffff0000
 ; SI-NEXT:    s_and_b32 s34, vcc_lo, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s35, vcc_lo, 16
@@ -26162,13 +26164,11 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mul_f32_e64 v0, 1.0, s47
-; SI-NEXT:    v_writelane_b32 v63, s36, 4
 ; SI-NEXT:    s_and_b32 s43, s42, 0xffff0000
 ; SI-NEXT:    v_readfirstlane_b32 s92, v4
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mul_f32_e64 v0, 1.0, s45
-; SI-NEXT:    v_writelane_b32 v63, s37, 5
 ; SI-NEXT:    s_and_b32 s4, s29, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s5, s29, 16
 ; SI-NEXT:    s_lshl_b32 s7, s28, 16
@@ -26882,12 +26882,12 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; SI-NEXT:    v_readlane_b32 s37, v63, 5
-; SI-NEXT:    v_readlane_b32 s36, v63, 4
-; SI-NEXT:    v_readlane_b32 s35, v63, 3
-; SI-NEXT:    v_readlane_b32 s34, v63, 2
-; SI-NEXT:    v_readlane_b32 s31, v63, 1
-; SI-NEXT:    v_readlane_b32 s30, v63, 0
+; SI-NEXT:    v_readlane_b32 s30, v63, 4
+; SI-NEXT:    v_readlane_b32 s31, v63, 5
+; SI-NEXT:    v_readlane_b32 s37, v63, 3
+; SI-NEXT:    v_readlane_b32 s36, v63, 2
+; SI-NEXT:    v_readlane_b32 s35, v63, 1
+; SI-NEXT:    v_readlane_b32 s34, v63, 0
 ; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -30301,28 +30301,28 @@ define inreg <64 x half> @bitcast_v32i32_to_v64f16_scalar(<32 x i32> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v32, s30, 0
-; SI-NEXT:    v_writelane_b32 v32, s31, 1
-; SI-NEXT:    v_writelane_b32 v32, s34, 2
-; SI-NEXT:    v_writelane_b32 v32, s35, 3
-; SI-NEXT:    v_writelane_b32 v32, s36, 4
-; SI-NEXT:    v_writelane_b32 v32, s37, 5
-; SI-NEXT:    v_writelane_b32 v32, s38, 6
-; SI-NEXT:    v_writelane_b32 v32, s39, 7
-; SI-NEXT:    v_writelane_b32 v32, s48, 8
-; SI-NEXT:    v_writelane_b32 v32, s49, 9
-; SI-NEXT:    v_writelane_b32 v32, s50, 10
-; SI-NEXT:    v_writelane_b32 v32, s51, 11
-; SI-NEXT:    v_writelane_b32 v32, s52, 12
-; SI-NEXT:    v_writelane_b32 v32, s53, 13
-; SI-NEXT:    v_writelane_b32 v32, s54, 14
-; SI-NEXT:    v_writelane_b32 v32, s55, 15
-; SI-NEXT:    v_writelane_b32 v32, s64, 16
-; SI-NEXT:    v_writelane_b32 v32, s65, 17
-; SI-NEXT:    v_writelane_b32 v32, s66, 18
-; SI-NEXT:    v_writelane_b32 v32, s67, 19
+; SI-NEXT:    v_writelane_b32 v32, s34, 0
+; SI-NEXT:    v_writelane_b32 v32, s35, 1
+; SI-NEXT:    v_writelane_b32 v32, s36, 2
+; SI-NEXT:    v_writelane_b32 v32, s37, 3
+; SI-NEXT:    v_writelane_b32 v32, s38, 4
+; SI-NEXT:    v_writelane_b32 v32, s39, 5
+; SI-NEXT:    v_writelane_b32 v32, s48, 6
+; SI-NEXT:    v_writelane_b32 v32, s49, 7
+; SI-NEXT:    v_writelane_b32 v32, s50, 8
+; SI-NEXT:    v_writelane_b32 v32, s51, 9
+; SI-NEXT:    v_writelane_b32 v32, s52, 10
+; SI-NEXT:    v_writelane_b32 v32, s53, 11
+; SI-NEXT:    v_writelane_b32 v32, s54, 12
+; SI-NEXT:    v_writelane_b32 v32, s55, 13
+; SI-NEXT:    v_writelane_b32 v32, s64, 14
+; SI-NEXT:    v_writelane_b32 v32, s65, 15
+; SI-NEXT:    v_writelane_b32 v32, s66, 16
+; SI-NEXT:    v_writelane_b32 v32, s67, 17
+; SI-NEXT:    v_writelane_b32 v32, s68, 18
+; SI-NEXT:    v_writelane_b32 v32, s69, 19
 ; SI-NEXT:    v_readfirstlane_b32 s44, v18
-; SI-NEXT:    v_writelane_b32 v32, s68, 20
+; SI-NEXT:    v_writelane_b32 v32, s30, 20
 ; SI-NEXT:    v_readfirstlane_b32 s5, v17
 ; SI-NEXT:    v_readfirstlane_b32 s4, v16
 ; SI-NEXT:    v_readfirstlane_b32 s7, v15
@@ -30342,7 +30342,7 @@ define inreg <64 x half> @bitcast_v32i32_to_v64f16_scalar(<32 x i32> inreg %a, i
 ; SI-NEXT:    v_readfirstlane_b32 s45, v1
 ; SI-NEXT:    s_cmp_lg_u32 s44, 0
 ; SI-NEXT:    v_readfirstlane_b32 s44, v0
-; SI-NEXT:    v_writelane_b32 v32, s69, 21
+; SI-NEXT:    v_writelane_b32 v32, s31, 21
 ; SI-NEXT:    s_cbranch_scc0 .LBB21_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s38, s5, 16
@@ -30540,6 +30540,7 @@ define inreg <64 x half> @bitcast_v32i32_to_v64f16_scalar(<32 x i32> inreg %a, i
 ; SI-NEXT:    s_lshl_b32 s46, s38, 16
 ; SI-NEXT:    s_or_b32 s7, s7, s47
 ; SI-NEXT:    s_or_b32 s5, s5, s46
+; SI-NEXT:    v_readlane_b32 s30, v32, 20
 ; SI-NEXT:    v_mov_b32_e32 v0, s16
 ; SI-NEXT:    v_mov_b32_e32 v1, s17
 ; SI-NEXT:    v_mov_b32_e32 v2, s18
@@ -30572,28 +30573,27 @@ define inreg <64 x half> @bitcast_v32i32_to_v64f16_scalar(<32 x i32> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v29, s7
 ; SI-NEXT:    v_mov_b32_e32 v30, s4
 ; SI-NEXT:    v_mov_b32_e32 v31, s5
-; SI-NEXT:    v_readlane_b32 s69, v32, 21
-; SI-NEXT:    v_readlane_b32 s68, v32, 20
-; SI-NEXT:    v_readlane_b32 s67, v32, 19
-; SI-NEXT:    v_readlane_b32 s66, v32, 18
-; SI-NEXT:    v_readlane_b32 s65, v32, 17
-; SI-NEXT:    v_readlane_b32 s64, v32, 16
-; SI-NEXT:    v_readlane_b32 s55, v32, 15
-; SI-NEXT:    v_readlane_b32 s54, v32, 14
-; SI-NEXT:    v_readlane_b32 s53, v32, 13
-; SI-NEXT:    v_readlane_b32 s52, v32, 12
-; SI-NEXT:    v_readlane_b32 s51, v32, 11
-; SI-NEXT:    v_readlane_b32 s50, v32, 10
-; SI-NEXT:    v_readlane_b32 s49, v32, 9
-; SI-NEXT:    v_readlane_b32 s48, v32, 8
-; SI-NEXT:    v_readlane_b32 s39, v32, 7
-; SI-NEXT:    v_readlane_b32 s38, v32, 6
-; SI-NEXT:    v_readlane_b32 s37, v32, 5
-; SI-NEXT:    v_readlane_b32 s36, v32, 4
-; SI-NEXT:    v_readlane_b32 s35, v32, 3
-; SI-NEXT:    v_readlane_b32 s34, v32, 2
-; SI-NEXT:    v_readlane_b32 s31, v32, 1
-; SI-NEXT:    v_readlane_b32 s30, v32, 0
+; SI-NEXT:    v_readlane_b32 s31, v32, 21
+; SI-NEXT:    v_readlane_b32 s69, v32, 19
+; SI-NEXT:    v_readlane_b32 s68, v32, 18
+; SI-NEXT:    v_readlane_b32 s67, v32, 17
+; SI-NEXT:    v_readlane_b32 s66, v32, 16
+; SI-NEXT:    v_readlane_b32 s65, v32, 15
+; SI-NEXT:    v_readlane_b32 s64, v32, 14
+; SI-NEXT:    v_readlane_b32 s55, v32, 13
+; SI-NEXT:    v_readlane_b32 s54, v32, 12
+; SI-NEXT:    v_readlane_b32 s53, v32, 11
+; SI-NEXT:    v_readlane_b32 s52, v32, 10
+; SI-NEXT:    v_readlane_b32 s51, v32, 9
+; SI-NEXT:    v_readlane_b32 s50, v32, 8
+; SI-NEXT:    v_readlane_b32 s49, v32, 7
+; SI-NEXT:    v_readlane_b32 s48, v32, 6
+; SI-NEXT:    v_readlane_b32 s39, v32, 5
+; SI-NEXT:    v_readlane_b32 s38, v32, 4
+; SI-NEXT:    v_readlane_b32 s37, v32, 3
+; SI-NEXT:    v_readlane_b32 s36, v32, 2
+; SI-NEXT:    v_readlane_b32 s35, v32, 1
+; SI-NEXT:    v_readlane_b32 s34, v32, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -31888,45 +31888,46 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_writelane_b32 v34, s30, 0
-; SI-NEXT:    v_writelane_b32 v34, s31, 1
-; SI-NEXT:    v_writelane_b32 v34, s34, 2
-; SI-NEXT:    v_writelane_b32 v34, s35, 3
-; SI-NEXT:    v_writelane_b32 v34, s36, 4
-; SI-NEXT:    v_writelane_b32 v34, s37, 5
-; SI-NEXT:    v_writelane_b32 v34, s38, 6
-; SI-NEXT:    v_writelane_b32 v34, s39, 7
-; SI-NEXT:    v_writelane_b32 v34, s48, 8
-; SI-NEXT:    v_writelane_b32 v34, s49, 9
-; SI-NEXT:    v_writelane_b32 v34, s50, 10
-; SI-NEXT:    v_writelane_b32 v34, s51, 11
-; SI-NEXT:    v_writelane_b32 v34, s52, 12
-; SI-NEXT:    v_writelane_b32 v34, s53, 13
-; SI-NEXT:    v_writelane_b32 v34, s54, 14
-; SI-NEXT:    v_writelane_b32 v34, s55, 15
-; SI-NEXT:    v_writelane_b32 v34, s64, 16
-; SI-NEXT:    v_writelane_b32 v34, s65, 17
-; SI-NEXT:    v_writelane_b32 v34, s66, 18
-; SI-NEXT:    v_writelane_b32 v34, s67, 19
-; SI-NEXT:    v_writelane_b32 v34, s68, 20
-; SI-NEXT:    v_writelane_b32 v34, s69, 21
-; SI-NEXT:    v_writelane_b32 v34, s70, 22
-; SI-NEXT:    v_writelane_b32 v34, s71, 23
-; SI-NEXT:    v_writelane_b32 v34, s80, 24
-; SI-NEXT:    v_writelane_b32 v34, s81, 25
-; SI-NEXT:    v_writelane_b32 v34, s82, 26
-; SI-NEXT:    v_writelane_b32 v34, s83, 27
-; SI-NEXT:    v_writelane_b32 v34, s84, 28
-; SI-NEXT:    v_writelane_b32 v34, s85, 29
-; SI-NEXT:    v_writelane_b32 v34, s86, 30
-; SI-NEXT:    v_writelane_b32 v34, s87, 31
-; SI-NEXT:    v_writelane_b32 v34, s96, 32
+; SI-NEXT:    v_writelane_b32 v34, s34, 0
+; SI-NEXT:    v_writelane_b32 v34, s35, 1
+; SI-NEXT:    v_writelane_b32 v34, s36, 2
+; SI-NEXT:    v_writelane_b32 v34, s37, 3
+; SI-NEXT:    v_writelane_b32 v34, s38, 4
+; SI-NEXT:    v_writelane_b32 v34, s39, 5
+; SI-NEXT:    v_writelane_b32 v34, s48, 6
+; SI-NEXT:    v_writelane_b32 v34, s49, 7
+; SI-NEXT:    v_writelane_b32 v34, s50, 8
+; SI-NEXT:    v_writelane_b32 v34, s51, 9
+; SI-NEXT:    v_writelane_b32 v34, s52, 10
+; SI-NEXT:    v_writelane_b32 v34, s53, 11
+; SI-NEXT:    v_writelane_b32 v34, s54, 12
+; SI-NEXT:    v_writelane_b32 v34, s55, 13
+; SI-NEXT:    v_writelane_b32 v34, s64, 14
+; SI-NEXT:    v_writelane_b32 v34, s65, 15
+; SI-NEXT:    v_writelane_b32 v34, s66, 16
+; SI-NEXT:    v_writelane_b32 v34, s67, 17
+; SI-NEXT:    v_writelane_b32 v34, s68, 18
+; SI-NEXT:    v_writelane_b32 v34, s69, 19
+; SI-NEXT:    v_writelane_b32 v34, s70, 20
+; SI-NEXT:    v_writelane_b32 v34, s71, 21
+; SI-NEXT:    v_writelane_b32 v34, s80, 22
+; SI-NEXT:    v_writelane_b32 v34, s81, 23
+; SI-NEXT:    v_writelane_b32 v34, s82, 24
+; SI-NEXT:    v_writelane_b32 v34, s83, 25
+; SI-NEXT:    v_writelane_b32 v34, s84, 26
+; SI-NEXT:    v_writelane_b32 v34, s85, 27
+; SI-NEXT:    v_writelane_b32 v34, s86, 28
+; SI-NEXT:    v_writelane_b32 v34, s87, 29
+; SI-NEXT:    v_writelane_b32 v34, s96, 30
+; SI-NEXT:    v_writelane_b32 v34, s97, 31
+; SI-NEXT:    v_writelane_b32 v34, s98, 32
+; SI-NEXT:    v_writelane_b32 v34, s99, 33
 ; SI-NEXT:    v_readfirstlane_b32 s6, v17
-; SI-NEXT:    v_writelane_b32 v34, s97, 33
+; SI-NEXT:    v_writelane_b32 v34, s30, 34
 ; SI-NEXT:    s_lshr_b32 vcc_lo, s6, 16
 ; SI-NEXT:    v_readfirstlane_b32 s8, v16
 ; SI-NEXT:    ; implicit-def: $vgpr35 : SGPR spill to VGPR lane
-; SI-NEXT:    v_writelane_b32 v34, s98, 34
+; SI-NEXT:    v_writelane_b32 v34, s31, 35
 ; SI-NEXT:    s_lshr_b32 vcc_hi, s8, 16
 ; SI-NEXT:    v_readfirstlane_b32 s10, v15
 ; SI-NEXT:    v_readfirstlane_b32 s12, v14
@@ -31946,7 +31947,6 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i
 ; SI-NEXT:    v_readfirstlane_b32 s83, v0
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_writelane_b32 v35, vcc_lo, 0
-; SI-NEXT:    v_writelane_b32 v34, s99, 35
 ; SI-NEXT:    s_lshr_b32 s69, s29, 16
 ; SI-NEXT:    s_lshr_b32 s71, s28, 16
 ; SI-NEXT:    s_lshr_b32 s82, s27, 16
@@ -32380,42 +32380,42 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v30, s66
 ; SI-NEXT:    v_mov_b32_e32 v31, s67
 ; SI-NEXT:  .LBB23_5: ; %end
-; SI-NEXT:    v_readlane_b32 s99, v34, 35
-; SI-NEXT:    v_readlane_b32 s98, v34, 34
-; SI-NEXT:    v_readlane_b32 s97, v34, 33
-; SI-NEXT:    v_readlane_b32 s96, v34, 32
-; SI-NEXT:    v_readlane_b32 s87, v34, 31
-; SI-NEXT:    v_readlane_b32 s86, v34, 30
-; SI-NEXT:    v_readlane_b32 s85, v34, 29
-; SI-NEXT:    v_readlane_b32 s84, v34, 28
-; SI-NEXT:    v_readlane_b32 s83, v34, 27
-; SI-NEXT:    v_readlane_b32 s82, v34, 26
-; SI-NEXT:    v_readlane_b32 s81, v34, 25
-; SI-NEXT:    v_readlane_b32 s80, v34, 24
-; SI-NEXT:    v_readlane_b32 s71, v34, 23
-; SI-NEXT:    v_readlane_b32 s70, v34, 22
-; SI-NEXT:    v_readlane_b32 s69, v34, 21
-; SI-NEXT:    v_readlane_b32 s68, v34, 20
-; SI-NEXT:    v_readlane_b32 s67, v34, 19
-; SI-NEXT:    v_readlane_b32 s66, v34, 18
-; SI-NEXT:    v_readlane_b32 s65, v34, 17
-; SI-NEXT:    v_readlane_b32 s64, v34, 16
-; SI-NEXT:    v_readlane_b32 s55, v34, 15
-; SI-NEXT:    v_readlane_b32 s54, v34, 14
-; SI-NEXT:    v_readlane_b32 s53, v34, 13
-; SI-NEXT:    v_readlane_b32 s52, v34, 12
-; SI-NEXT:    v_readlane_b32 s51, v34, 11
-; SI-NEXT:    v_readlane_b32 s50, v34, 10
-; SI-NEXT:    v_readlane_b32 s49, v34, 9
-; SI-NEXT:    v_readlane_b32 s48, v34, 8
-; SI-NEXT:    v_readlane_b32 s39, v34, 7
-; SI-NEXT:    v_readlane_b32 s38, v34, 6
-; SI-NEXT:    v_readlane_b32 s37, v34, 5
-; SI-NEXT:    v_readlane_b32 s36, v34, 4
-; SI-NEXT:    v_readlane_b32 s35, v34, 3
-; SI-NEXT:    v_readlane_b32 s34, v34, 2
-; SI-NEXT:    v_readlane_b32 s31, v34, 1
-; SI-NEXT:    v_readlane_b32 s30, v34, 0
+; SI-NEXT:    v_readlane_b32 s30, v34, 34
+; SI-NEXT:    v_readlane_b32 s31, v34, 35
+; SI-NEXT:    v_readlane_b32 s99, v34, 33
+; SI-NEXT:    v_readlane_b32 s98, v34, 32
+; SI-NEXT:    v_readlane_b32 s97, v34, 31
+; SI-NEXT:    v_readlane_b32 s96, v34, 30
+; SI-NEXT:    v_readlane_b32 s87, v34, 29
+; SI-NEXT:    v_readlane_b32 s86, v34, 28
+; SI-NEXT:    v_readlane_b32 s85, v34, 27
+; SI-NEXT:    v_readlane_b32 s84, v34, 26
+; SI-NEXT:    v_readlane_b32 s83, v34, 25
+; SI-NEXT:    v_readlane_b32 s82, v34, 24
+; SI-NEXT:    v_readlane_b32 s81, v34, 23
+; SI-NEXT:    v_readlane_b32 s80, v34, 22
+; SI-NEXT:    v_readlane_b32 s71, v34, 21
+; SI-NEXT:    v_readlane_b32 s70, v34, 20
+; SI-NEXT:    v_readlane_b32 s69, v34, 19
+; SI-NEXT:    v_readlane_b32 s68, v34, 18
+; SI-NEXT:    v_readlane_b32 s67, v34, 17
+; SI-NEXT:    v_readlane_b32 s66, v34, 16
+; SI-NEXT:    v_readlane_b32 s65, v34, 15
+; SI-NEXT:    v_readlane_b32 s64, v34, 14
+; SI-NEXT:    v_readlane_b32 s55, v34, 13
+; SI-NEXT:    v_readlane_b32 s54, v34, 12
+; SI-NEXT:    v_readlane_b32 s53, v34, 11
+; SI-NEXT:    v_readlane_b32 s52, v34, 10
+; SI-NEXT:    v_readlane_b32 s51, v34, 9
+; SI-NEXT:    v_readlane_b32 s50, v34, 8
+; SI-NEXT:    v_readlane_b32 s49, v34, 7
+; SI-NEXT:    v_readlane_b32 s48, v34, 6
+; SI-NEXT:    v_readlane_b32 s39, v34, 5
+; SI-NEXT:    v_readlane_b32 s38, v34, 4
+; SI-NEXT:    v_readlane_b32 s37, v34, 3
+; SI-NEXT:    v_readlane_b32 s36, v34, 2
+; SI-NEXT:    v_readlane_b32 s35, v34, 1
+; SI-NEXT:    v_readlane_b32 s34, v34, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -33476,28 +33476,28 @@ define inreg <64 x i16> @bitcast_v32i32_to_v64i16_scalar(<32 x i32> inreg %a, i3
 ; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v32, s30, 0
-; SI-NEXT:    v_writelane_b32 v32, s31, 1
-; SI-NEXT:    v_writelane_b32 v32, s34, 2
-; SI-NEXT:    v_writelane_b32 v32, s35, 3
-; SI-NEXT:    v_writelane_b32 v32, s36, 4
-; SI-NEXT:    v_writelane_b32 v32, s37, 5
-; SI-NEXT:    v_writelane_b32 v32, s38, 6
-; SI-NEXT:    v_writelane_b32 v32, s39, 7
-; SI-NEXT:    v_writelane_b32 v32, s48, 8
-; SI-NEXT:    v_writelane_b32 v32, s49, 9
-; SI-NEXT:    v_writelane_b32 v32, s50, 10
-; SI-NEXT:    v_writelane_b32 v32, s51, 11
-; SI-NEXT:    v_writelane_b32 v32, s52, 12
-; SI-NEXT:    v_writelane_b32 v32, s53, 13
-; SI-NEXT:    v_writelane_b32 v32, s54, 14
-; SI-NEXT:    v_writelane_b32 v32, s55, 15
-; SI-NEXT:    v_writelane_b32 v32, s64, 16
-; SI-NEXT:    v_writelane_b32 v32, s65, 17
-; SI-NEXT:    v_writelane_b32 v32, s66, 18
-; SI-NEXT:    v_writelane_b32 v32, s67, 19
+; SI-NEXT:    v_writelane_b32 v32, s34, 0
+; SI-NEXT:    v_writelane_b32 v32, s35, 1
+; SI-NEXT:    v_writelane_b32 v32, s36, 2
+; SI-NEXT:    v_writelane_b32 v32, s37, 3
+; SI-NEXT:    v_writelane_b32 v32, s38, 4
+; SI-NEXT:    v_writelane_b32 v32, s39, 5
+; SI-NEXT:    v_writelane_b32 v32, s48, 6
+; SI-NEXT:    v_writelane_b32 v32, s49, 7
+; SI-NEXT:    v_writelane_b32 v32, s50, 8
+; SI-NEXT:    v_writelane_b32 v32, s51, 9
+; SI-NEXT:    v_writelane_b32 v32, s52, 10
+; SI-NEXT:    v_writelane_b32 v32, s53, 11
+; SI-NEXT:    v_writelane_b32 v32, s54, 12
+; SI-NEXT:    v_writelane_b32 v32, s55, 13
+; SI-NEXT:    v_writelane_b32 v32, s64, 14
+; SI-NEXT:    v_writelane_b32 v32, s65, 15
+; SI-NEXT:    v_writelane_b32 v32, s66, 16
+; SI-NEXT:    v_writelane_b32 v32, s67, 17
+; SI-NEXT:    v_writelane_b32 v32, s68, 18
+; SI-NEXT:    v_writelane_b32 v32, s69, 19
 ; SI-NEXT:    v_readfirstlane_b32 s44, v18
-; SI-NEXT:    v_writelane_b32 v32, s68, 20
+; SI-NEXT:    v_writelane_b32 v32, s30, 20
 ; SI-NEXT:    v_readfirstlane_b32 s5, v17
 ; SI-NEXT:    v_readfirstlane_b32 s4, v16
 ; SI-NEXT:    v_readfirstlane_b32 s7, v15
@@ -33517,7 +33517,7 @@ define inreg <64 x i16> @bitcast_v32i32_to_v64i16_scalar(<32 x i32> inreg %a, i3
 ; SI-NEXT:    v_readfirstlane_b32 s45, v1
 ; SI-NEXT:    s_cmp_lg_u32 s44, 0
 ; SI-NEXT:    v_readfirstlane_b32 s44, v0
-; SI-NEXT:    v_writelane_b32 v32, s69, 21
+; SI-NEXT:    v_writelane_b32 v32, s31, 21
 ; SI-NEXT:    s_cbranch_scc0 .LBB25_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s38, s5, 16
@@ -33715,6 +33715,7 @@ define inreg <64 x i16> @bitcast_v32i32_to_v64i16_scalar(<32 x i32> inreg %a, i3
 ; SI-NEXT:    s_lshl_b32 s46, s38, 16
 ; SI-NEXT:    s_or_b32 s7, s7, s47
 ; SI-NEXT:    s_or_b32 s5, s5, s46
+; SI-NEXT:    v_readlane_b32 s30, v32, 20
 ; SI-NEXT:    v_mov_b32_e32 v0, s16
 ; SI-NEXT:    v_mov_b32_e32 v1, s17
 ; SI-NEXT:    v_mov_b32_e32 v2, s18
@@ -33747,28 +33748,27 @@ define inreg <64 x i16> @bitcast_v32i32_to_v64i16_scalar(<32 x i32> inreg %a, i3
 ; SI-NEXT:    v_mov_b32_e32 v29, s7
 ; SI-NEXT:    v_mov_b32_e32 v30, s4
 ; SI-NEXT:    v_mov_b32_e32 v31, s5
-; SI-NEXT:    v_readlane_b32 s69, v32, 21
-; SI-NEXT:    v_readlane_b32 s68, v32, 20
-; SI-NEXT:    v_readlane_b32 s67, v32, 19
-; SI-NEXT:    v_readlane_b32 s66, v32, 18
-; SI-NEXT:    v_readlane_b32 s65, v32, 17
-; SI-NEXT:    v_readlane_b32 s64, v32, 16
-; SI-NEXT:    v_readlane_b32 s55, v32, 15
-; SI-NEXT:    v_readlane_b32 s54, v32, 14
-; SI-NEXT:    v_readlane_b32 s53, v32, 13
-; SI-NEXT:    v_readlane_b32 s52, v32, 12
-; SI-NEXT:    v_readlane_b32 s51, v32, 11
-; SI-NEXT:    v_readlane_b32 s50, v32, 10
-; SI-NEXT:    v_readlane_b32 s49, v32, 9
-; SI-NEXT:    v_readlane_b32 s48, v32, 8
-; SI-NEXT:    v_readlane_b32 s39, v32, 7
-; SI-NEXT:    v_readlane_b32 s38, v32, 6
-; SI-NEXT:    v_readlane_b32 s37, v32, 5
-; SI-NEXT:    v_readlane_b32 s36, v32, 4
-; SI-NEXT:    v_readlane_b32 s35, v32, 3
-; SI-NEXT:    v_readlane_b32 s34, v32, 2
-; SI-NEXT:    v_readlane_b32 s31, v32, 1
-; SI-NEXT:    v_readlane_b32 s30, v32, 0
+; SI-NEXT:    v_readlane_b32 s31, v32, 21
+; SI-NEXT:    v_readlane_b32 s69, v32, 19
+; SI-NEXT:    v_readlane_b32 s68, v32, 18
+; SI-NEXT:    v_readlane_b32 s67, v32, 17
+; SI-NEXT:    v_readlane_b32 s66, v32, 16
+; SI-NEXT:    v_readlane_b32 s65, v32, 15
+; SI-NEXT:    v_readlane_b32 s64, v32, 14
+; SI-NEXT:    v_readlane_b32 s55, v32, 13
+; SI-NEXT:    v_readlane_b32 s54, v32, 12
+; SI-NEXT:    v_readlane_b32 s53, v32, 11
+; SI-NEXT:    v_readlane_b32 s52, v32, 10
+; SI-NEXT:    v_readlane_b32 s51, v32, 9
+; SI-NEXT:    v_readlane_b32 s50, v32, 8
+; SI-NEXT:    v_readlane_b32 s49, v32, 7
+; SI-NEXT:    v_readlane_b32 s48, v32, 6
+; SI-NEXT:    v_readlane_b32 s39, v32, 5
+; SI-NEXT:    v_readlane_b32 s38, v32, 4
+; SI-NEXT:    v_readlane_b32 s37, v32, 3
+; SI-NEXT:    v_readlane_b32 s36, v32, 2
+; SI-NEXT:    v_readlane_b32 s35, v32, 1
+; SI-NEXT:    v_readlane_b32 s34, v32, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -34893,45 +34893,46 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_writelane_b32 v32, s30, 0
-; SI-NEXT:    v_writelane_b32 v32, s31, 1
-; SI-NEXT:    v_writelane_b32 v32, s34, 2
-; SI-NEXT:    v_writelane_b32 v32, s35, 3
-; SI-NEXT:    v_writelane_b32 v32, s36, 4
-; SI-NEXT:    v_writelane_b32 v32, s37, 5
-; SI-NEXT:    v_writelane_b32 v32, s38, 6
-; SI-NEXT:    v_writelane_b32 v32, s39, 7
-; SI-NEXT:    v_writelane_b32 v32, s48, 8
-; SI-NEXT:    v_writelane_b32 v32, s49, 9
-; SI-NEXT:    v_writelane_b32 v32, s50, 10
-; SI-NEXT:    v_writelane_b32 v32, s51, 11
-; SI-NEXT:    v_writelane_b32 v32, s52, 12
-; SI-NEXT:    v_writelane_b32 v32, s53, 13
-; SI-NEXT:    v_writelane_b32 v32, s54, 14
-; SI-NEXT:    v_writelane_b32 v32, s55, 15
-; SI-NEXT:    v_writelane_b32 v32, s64, 16
-; SI-NEXT:    v_writelane_b32 v32, s65, 17
-; SI-NEXT:    v_writelane_b32 v32, s66, 18
-; SI-NEXT:    v_writelane_b32 v32, s67, 19
-; SI-NEXT:    v_writelane_b32 v32, s68, 20
-; SI-NEXT:    v_writelane_b32 v32, s69, 21
-; SI-NEXT:    v_writelane_b32 v32, s70, 22
-; SI-NEXT:    v_writelane_b32 v32, s71, 23
-; SI-NEXT:    v_writelane_b32 v32, s80, 24
-; SI-NEXT:    v_writelane_b32 v32, s81, 25
-; SI-NEXT:    v_writelane_b32 v32, s82, 26
-; SI-NEXT:    v_writelane_b32 v32, s83, 27
-; SI-NEXT:    v_writelane_b32 v32, s84, 28
-; SI-NEXT:    v_writelane_b32 v32, s85, 29
-; SI-NEXT:    v_writelane_b32 v32, s86, 30
-; SI-NEXT:    v_writelane_b32 v32, s87, 31
-; SI-NEXT:    v_writelane_b32 v32, s96, 32
+; SI-NEXT:    v_writelane_b32 v32, s34, 0
+; SI-NEXT:    v_writelane_b32 v32, s35, 1
+; SI-NEXT:    v_writelane_b32 v32, s36, 2
+; SI-NEXT:    v_writelane_b32 v32, s37, 3
+; SI-NEXT:    v_writelane_b32 v32, s38, 4
+; SI-NEXT:    v_writelane_b32 v32, s39, 5
+; SI-NEXT:    v_writelane_b32 v32, s48, 6
+; SI-NEXT:    v_writelane_b32 v32, s49, 7
+; SI-NEXT:    v_writelane_b32 v32, s50, 8
+; SI-NEXT:    v_writelane_b32 v32, s51, 9
+; SI-NEXT:    v_writelane_b32 v32, s52, 10
+; SI-NEXT:    v_writelane_b32 v32, s53, 11
+; SI-NEXT:    v_writelane_b32 v32, s54, 12
+; SI-NEXT:    v_writelane_b32 v32, s55, 13
+; SI-NEXT:    v_writelane_b32 v32, s64, 14
+; SI-NEXT:    v_writelane_b32 v32, s65, 15
+; SI-NEXT:    v_writelane_b32 v32, s66, 16
+; SI-NEXT:    v_writelane_b32 v32, s67, 17
+; SI-NEXT:    v_writelane_b32 v32, s68, 18
+; SI-NEXT:    v_writelane_b32 v32, s69, 19
+; SI-NEXT:    v_writelane_b32 v32, s70, 20
+; SI-NEXT:    v_writelane_b32 v32, s71, 21
+; SI-NEXT:    v_writelane_b32 v32, s80, 22
+; SI-NEXT:    v_writelane_b32 v32, s81, 23
+; SI-NEXT:    v_writelane_b32 v32, s82, 24
+; SI-NEXT:    v_writelane_b32 v32, s83, 25
+; SI-NEXT:    v_writelane_b32 v32, s84, 26
+; SI-NEXT:    v_writelane_b32 v32, s85, 27
+; SI-NEXT:    v_writelane_b32 v32, s86, 28
+; SI-NEXT:    v_writelane_b32 v32, s87, 29
+; SI-NEXT:    v_writelane_b32 v32, s96, 30
+; SI-NEXT:    v_writelane_b32 v32, s97, 31
+; SI-NEXT:    v_writelane_b32 v32, s98, 32
+; SI-NEXT:    v_writelane_b32 v32, s99, 33
 ; SI-NEXT:    v_readfirstlane_b32 s9, v16
-; SI-NEXT:    v_writelane_b32 v32, s97, 33
+; SI-NEXT:    v_writelane_b32 v32, s30, 34
 ; SI-NEXT:    s_lshr_b32 s14, s9, 16
 ; SI-NEXT:    v_readfirstlane_b32 s13, v14
 ; SI-NEXT:    ; implicit-def: $vgpr33 : SGPR spill to VGPR lane
-; SI-NEXT:    v_writelane_b32 v32, s98, 34
+; SI-NEXT:    v_writelane_b32 v32, s31, 35
 ; SI-NEXT:    v_readfirstlane_b32 s7, v17
 ; SI-NEXT:    v_readfirstlane_b32 s11, v15
 ; SI-NEXT:    s_lshr_b32 s72, s13, 16
@@ -34951,7 +34952,6 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    v_readfirstlane_b32 s97, v0
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_writelane_b32 v33, s14, 0
-; SI-NEXT:    v_writelane_b32 v32, s99, 35
 ; SI-NEXT:    s_lshr_b32 s92, s29, 16
 ; SI-NEXT:    s_lshr_b32 s95, s28, 16
 ; SI-NEXT:    s_lshr_b32 s34, s27, 16
@@ -35256,6 +35256,7 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_add_i32 s67, s4, 0x30000
 ; SI-NEXT:  .LBB27_3: ; %end
+; SI-NEXT:    v_readlane_b32 s30, v32, 34
 ; SI-NEXT:    v_mov_b32_e32 v0, s36
 ; SI-NEXT:    v_mov_b32_e32 v1, s37
 ; SI-NEXT:    v_mov_b32_e32 v2, s38
@@ -35288,42 +35289,41 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    v_mov_b32_e32 v29, s65
 ; SI-NEXT:    v_mov_b32_e32 v30, s66
 ; SI-NEXT:    v_mov_b32_e32 v31, s67
-; SI-NEXT:    v_readlane_b32 s99, v32, 35
-; SI-NEXT:    v_readlane_b32 s98, v32, 34
-; SI-NEXT:    v_readlane_b32 s97, v32, 33
-; SI-NEXT:    v_readlane_b32 s96, v32, 32
-; SI-NEXT:    v_readlane_b32 s87, v32, 31
-; SI-NEXT:    v_readlane_b32 s86, v32, 30
-; SI-NEXT:    v_readlane_b32 s85, v32, 29
-; SI-NEXT:    v_readlane_b32 s84, v32, 28
-; SI-NEXT:    v_readlane_b32 s83, v32, 27
-; SI-NEXT:    v_readlane_b32 s82, v32, 26
-; SI-NEXT:    v_readlane_b32 s81, v32, 25
-; SI-NEXT:    v_readlane_b32 s80, v32, 24
-; SI-NEXT:    v_readlane_b32 s71, v32, 23
-; SI-NEXT:    v_readlane_b32 s70, v32, 22
-; SI-NEXT:    v_readlane_b32 s69, v32, 21
-; SI-NEXT:    v_readlane_b32 s68, v32, 20
-; SI-NEXT:    v_readlane_b32 s67, v32, 19
-; SI-NEXT:    v_readlane_b32 s66, v32, 18
-; SI-NEXT:    v_readlane_b32 s65, v32, 17
-; SI-NEXT:    v_readlane_b32 s64, v32, 16
-; SI-NEXT:    v_readlane_b32 s55, v32, 15
-; SI-NEXT:    v_readlane_b32 s54, v32, 14
-; SI-NEXT:    v_readlane_b32 s53, v32, 13
-; SI-NEXT:    v_readlane_b32 s52, v32, 12
-; SI-NEXT:    v_readlane_b32 s51, v32, 11
-; SI-NEXT:    v_readlane_b32 s50, v32, 10
-; SI-NEXT:    v_readlane_b32 s49, v32, 9
-; SI-NEXT:    v_readlane_b32 s48, v32, 8
-; SI-NEXT:    v_readlane_b32 s39, v32, 7
-; SI-NEXT:    v_readlane_b32 s38, v32, 6
-; SI-NEXT:    v_readlane_b32 s37, v32, 5
-; SI-NEXT:    v_readlane_b32 s36, v32, 4
-; SI-NEXT:    v_readlane_b32 s35, v32, 3
-; SI-NEXT:    v_readlane_b32 s34, v32, 2
-; SI-NEXT:    v_readlane_b32 s31, v32, 1
-; SI-NEXT:    v_readlane_b32 s30, v32, 0
+; SI-NEXT:    v_readlane_b32 s31, v32, 35
+; SI-NEXT:    v_readlane_b32 s99, v32, 33
+; SI-NEXT:    v_readlane_b32 s98, v32, 32
+; SI-NEXT:    v_readlane_b32 s97, v32, 31
+; SI-NEXT:    v_readlane_b32 s96, v32, 30
+; SI-NEXT:    v_readlane_b32 s87, v32, 29
+; SI-NEXT:    v_readlane_b32 s86, v32, 28
+; SI-NEXT:    v_readlane_b32 s85, v32, 27
+; SI-NEXT:    v_readlane_b32 s84, v32, 26
+; SI-NEXT:    v_readlane_b32 s83, v32, 25
+; SI-NEXT:    v_readlane_b32 s82, v32, 24
+; SI-NEXT:    v_readlane_b32 s81, v32, 23
+; SI-NEXT:    v_readlane_b32 s80, v32, 22
+; SI-NEXT:    v_readlane_b32 s71, v32, 21
+; SI-NEXT:    v_readlane_b32 s70, v32, 20
+; SI-NEXT:    v_readlane_b32 s69, v32, 19
+; SI-NEXT:    v_readlane_b32 s68, v32, 18
+; SI-NEXT:    v_readlane_b32 s67, v32, 17
+; SI-NEXT:    v_readlane_b32 s66, v32, 16
+; SI-NEXT:    v_readlane_b32 s65, v32, 15
+; SI-NEXT:    v_readlane_b32 s64, v32, 14
+; SI-NEXT:    v_readlane_b32 s55, v32, 13
+; SI-NEXT:    v_readlane_b32 s54, v32, 12
+; SI-NEXT:    v_readlane_b32 s53, v32, 11
+; SI-NEXT:    v_readlane_b32 s52, v32, 10
+; SI-NEXT:    v_readlane_b32 s51, v32, 9
+; SI-NEXT:    v_readlane_b32 s50, v32, 8
+; SI-NEXT:    v_readlane_b32 s49, v32, 7
+; SI-NEXT:    v_readlane_b32 s48, v32, 6
+; SI-NEXT:    v_readlane_b32 s39, v32, 5
+; SI-NEXT:    v_readlane_b32 s38, v32, 4
+; SI-NEXT:    v_readlane_b32 s37, v32, 3
+; SI-NEXT:    v_readlane_b32 s36, v32, 2
+; SI-NEXT:    v_readlane_b32 s35, v32, 1
+; SI-NEXT:    v_readlane_b32 s34, v32, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -42711,43 +42711,43 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
 ; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(2)
-; SI-NEXT:    v_writelane_b32 v63, s30, 0
-; SI-NEXT:    v_writelane_b32 v63, s31, 1
-; SI-NEXT:    v_writelane_b32 v63, s34, 2
-; SI-NEXT:    v_writelane_b32 v63, s35, 3
-; SI-NEXT:    v_writelane_b32 v63, s36, 4
-; SI-NEXT:    v_writelane_b32 v63, s37, 5
-; SI-NEXT:    v_writelane_b32 v63, s38, 6
-; SI-NEXT:    v_writelane_b32 v63, s39, 7
-; SI-NEXT:    v_writelane_b32 v63, s48, 8
-; SI-NEXT:    v_writelane_b32 v63, s49, 9
-; SI-NEXT:    v_writelane_b32 v63, s50, 10
-; SI-NEXT:    v_writelane_b32 v63, s51, 11
-; SI-NEXT:    v_writelane_b32 v63, s52, 12
-; SI-NEXT:    v_writelane_b32 v63, s53, 13
-; SI-NEXT:    v_writelane_b32 v63, s54, 14
-; SI-NEXT:    v_writelane_b32 v63, s55, 15
-; SI-NEXT:    v_writelane_b32 v63, s64, 16
-; SI-NEXT:    v_writelane_b32 v63, s65, 17
-; SI-NEXT:    v_writelane_b32 v63, s66, 18
-; SI-NEXT:    v_writelane_b32 v63, s67, 19
-; SI-NEXT:    v_writelane_b32 v63, s68, 20
-; SI-NEXT:    v_writelane_b32 v63, s69, 21
-; SI-NEXT:    v_writelane_b32 v63, s70, 22
-; SI-NEXT:    v_writelane_b32 v63, s71, 23
-; SI-NEXT:    v_writelane_b32 v63, s80, 24
-; SI-NEXT:    v_writelane_b32 v63, s81, 25
-; SI-NEXT:    v_writelane_b32 v63, s82, 26
-; SI-NEXT:    v_writelane_b32 v63, s83, 27
-; SI-NEXT:    v_writelane_b32 v63, s84, 28
-; SI-NEXT:    v_writelane_b32 v63, s85, 29
-; SI-NEXT:    v_writelane_b32 v63, s86, 30
-; SI-NEXT:    v_writelane_b32 v63, s87, 31
-; SI-NEXT:    v_writelane_b32 v63, s96, 32
-; SI-NEXT:    v_writelane_b32 v63, s97, 33
-; SI-NEXT:    v_writelane_b32 v63, s98, 34
+; SI-NEXT:    v_writelane_b32 v63, s34, 0
+; SI-NEXT:    v_writelane_b32 v63, s35, 1
+; SI-NEXT:    v_writelane_b32 v63, s36, 2
+; SI-NEXT:    v_writelane_b32 v63, s37, 3
+; SI-NEXT:    v_writelane_b32 v63, s38, 4
+; SI-NEXT:    v_writelane_b32 v63, s39, 5
+; SI-NEXT:    v_writelane_b32 v63, s48, 6
+; SI-NEXT:    v_writelane_b32 v63, s49, 7
+; SI-NEXT:    v_writelane_b32 v63, s50, 8
+; SI-NEXT:    v_writelane_b32 v63, s51, 9
+; SI-NEXT:    v_writelane_b32 v63, s52, 10
+; SI-NEXT:    v_writelane_b32 v63, s53, 11
+; SI-NEXT:    v_writelane_b32 v63, s54, 12
+; SI-NEXT:    v_writelane_b32 v63, s55, 13
+; SI-NEXT:    v_writelane_b32 v63, s64, 14
+; SI-NEXT:    v_writelane_b32 v63, s65, 15
+; SI-NEXT:    v_writelane_b32 v63, s66, 16
+; SI-NEXT:    v_writelane_b32 v63, s67, 17
+; SI-NEXT:    v_writelane_b32 v63, s68, 18
+; SI-NEXT:    v_writelane_b32 v63, s69, 19
+; SI-NEXT:    v_writelane_b32 v63, s70, 20
+; SI-NEXT:    v_writelane_b32 v63, s71, 21
+; SI-NEXT:    v_writelane_b32 v63, s80, 22
+; SI-NEXT:    v_writelane_b32 v63, s81, 23
+; SI-NEXT:    v_writelane_b32 v63, s82, 24
+; SI-NEXT:    v_writelane_b32 v63, s83, 25
+; SI-NEXT:    v_writelane_b32 v63, s84, 26
+; SI-NEXT:    v_writelane_b32 v63, s85, 27
+; SI-NEXT:    v_writelane_b32 v63, s86, 28
+; SI-NEXT:    v_writelane_b32 v63, s87, 29
+; SI-NEXT:    v_writelane_b32 v63, s96, 30
+; SI-NEXT:    v_writelane_b32 v63, s97, 31
+; SI-NEXT:    v_writelane_b32 v63, s98, 32
+; SI-NEXT:    v_writelane_b32 v63, s99, 33
+; SI-NEXT:    v_writelane_b32 v63, s30, 34
 ; SI-NEXT:    v_readfirstlane_b32 s4, v19
-; SI-NEXT:    v_writelane_b32 v63, s99, 35
+; SI-NEXT:    v_writelane_b32 v63, s31, 35
 ; SI-NEXT:    v_readfirstlane_b32 s45, v18
 ; SI-NEXT:    v_readfirstlane_b32 s44, v17
 ; SI-NEXT:    v_readfirstlane_b32 s47, v16
@@ -43673,32 +43673,31 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
 ; SI-NEXT:    s_lshl_b32 s4, s70, 8
 ; SI-NEXT:    s_lshl_b32 s5, s66, 24
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT:    v_readlane_b32 s99, v63, 35
-; SI-NEXT:    v_readlane_b32 s98, v63, 34
-; SI-NEXT:    v_readlane_b32 s97, v63, 33
-; SI-NEXT:    v_readlane_b32 s96, v63, 32
-; SI-NEXT:    v_readlane_b32 s87, v63, 31
-; SI-NEXT:    v_readlane_b32 s86, v63, 30
-; SI-NEXT:    v_readlane_b32 s85, v63, 29
-; SI-NEXT:    v_readlane_b32 s84, v63, 28
-; SI-NEXT:    v_readlane_b32 s83, v63, 27
-; SI-NEXT:    v_readlane_b32 s82, v63, 26
-; SI-NEXT:    v_readlane_b32 s81, v63, 25
-; SI-NEXT:    v_readlane_b32 s80, v63, 24
-; SI-NEXT:    v_readlane_b32 s71, v63, 23
-; SI-NEXT:    v_readlane_b32 s70, v63, 22
-; SI-NEXT:    v_readlane_b32 s69, v63, 21
-; SI-NEXT:    v_readlane_b32 s67, v63, 19
-; SI-NEXT:    v_readlane_b32 s66, v63, 18
-; SI-NEXT:    v_readlane_b32 s65, v63, 17
-; SI-NEXT:    v_readlane_b32 s55, v63, 15
-; SI-NEXT:    v_readlane_b32 s53, v63, 13
-; SI-NEXT:    v_readlane_b32 s51, v63, 11
-; SI-NEXT:    v_readlane_b32 s49, v63, 9
-; SI-NEXT:    v_readlane_b32 s39, v63, 7
-; SI-NEXT:    v_readlane_b32 s37, v63, 5
-; SI-NEXT:    v_readlane_b32 s35, v63, 3
-; SI-NEXT:    v_readlane_b32 s31, v63, 1
+; SI-NEXT:    v_readlane_b32 s99, v63, 33
+; SI-NEXT:    v_readlane_b32 s98, v63, 32
+; SI-NEXT:    v_readlane_b32 s97, v63, 31
+; SI-NEXT:    v_readlane_b32 s96, v63, 30
+; SI-NEXT:    v_readlane_b32 s87, v63, 29
+; SI-NEXT:    v_readlane_b32 s86, v63, 28
+; SI-NEXT:    v_readlane_b32 s85, v63, 27
+; SI-NEXT:    v_readlane_b32 s84, v63, 26
+; SI-NEXT:    v_readlane_b32 s83, v63, 25
+; SI-NEXT:    v_readlane_b32 s82, v63, 24
+; SI-NEXT:    v_readlane_b32 s81, v63, 23
+; SI-NEXT:    v_readlane_b32 s80, v63, 22
+; SI-NEXT:    v_readlane_b32 s71, v63, 21
+; SI-NEXT:    v_readlane_b32 s70, v63, 20
+; SI-NEXT:    v_readlane_b32 s69, v63, 19
+; SI-NEXT:    v_readlane_b32 s67, v63, 17
+; SI-NEXT:    v_readlane_b32 s66, v63, 16
+; SI-NEXT:    v_readlane_b32 s65, v63, 15
+; SI-NEXT:    v_readlane_b32 s55, v63, 13
+; SI-NEXT:    v_readlane_b32 s53, v63, 11
+; SI-NEXT:    v_readlane_b32 s51, v63, 9
+; SI-NEXT:    v_readlane_b32 s49, v63, 7
+; SI-NEXT:    v_readlane_b32 s39, v63, 5
+; SI-NEXT:    v_readlane_b32 s37, v63, 3
+; SI-NEXT:    v_readlane_b32 s35, v63, 1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
 ; SI-NEXT:    v_or_b32_e32 v2, v6, v2
@@ -43720,9 +43719,9 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
 ; SI-NEXT:    v_and_b32_e32 v2, 0xff, v21
 ; SI-NEXT:    s_lshl_b32 s4, s64, 8
 ; SI-NEXT:    s_lshl_b32 s5, s52, 24
-; SI-NEXT:    v_readlane_b32 s68, v63, 20
-; SI-NEXT:    v_readlane_b32 s64, v63, 16
-; SI-NEXT:    v_readlane_b32 s52, v63, 12
+; SI-NEXT:    v_readlane_b32 s68, v63, 18
+; SI-NEXT:    v_readlane_b32 s64, v63, 14
+; SI-NEXT:    v_readlane_b32 s52, v63, 10
 ; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
@@ -43754,8 +43753,8 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
 ; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
 ; SI-NEXT:    s_lshl_b32 s4, s18, 8
 ; SI-NEXT:    s_lshl_b32 s5, s48, 24
-; SI-NEXT:    v_readlane_b32 s54, v63, 14
-; SI-NEXT:    v_readlane_b32 s48, v63, 8
+; SI-NEXT:    v_readlane_b32 s54, v63, 12
+; SI-NEXT:    v_readlane_b32 s48, v63, 6
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
 ; SI-NEXT:    v_or_b32_e32 v2, v2, v4
@@ -43786,8 +43785,8 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
 ; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
 ; SI-NEXT:    s_lshl_b32 s4, s20, 8
 ; SI-NEXT:    s_lshl_b32 s5, s36, 24
-; SI-NEXT:    v_readlane_b32 s50, v63, 10
-; SI-NEXT:    v_readlane_b32 s36, v63, 4
+; SI-NEXT:    v_readlane_b32 s50, v63, 8
+; SI-NEXT:    v_readlane_b32 s36, v63, 2
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
 ; SI-NEXT:    v_or_b32_e32 v2, v2, v4
@@ -43818,8 +43817,8 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
 ; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
 ; SI-NEXT:    s_lshl_b32 s4, s22, 8
 ; SI-NEXT:    s_lshl_b32 s5, s34, 24
-; SI-NEXT:    v_readlane_b32 s38, v63, 6
-; SI-NEXT:    v_readlane_b32 s34, v63, 2
+; SI-NEXT:    v_readlane_b32 s38, v63, 4
+; SI-NEXT:    v_readlane_b32 s34, v63, 0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
 ; SI-NEXT:    v_or_b32_e32 v2, v2, v4
@@ -43888,7 +43887,8 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
 ; SI-NEXT:    s_lshl_b32 s5, s90, 24
 ; SI-NEXT:    v_add_i32_e32 v4, vcc, 0x58, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v6, 24, v59
-; SI-NEXT:    v_readlane_b32 s30, v63, 0
+; SI-NEXT:    v_readlane_b32 s30, v63, 34
+; SI-NEXT:    v_readlane_b32 s31, v63, 35
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; SI-NEXT:    v_or_b32_e32 v2, s4, v2
@@ -44044,39 +44044,39 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
 ; VI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v63, s30, 0
-; VI-NEXT:    v_writelane_b32 v63, s31, 1
-; VI-NEXT:    v_writelane_b32 v63, s34, 2
-; VI-NEXT:    v_writelane_b32 v63, s35, 3
-; VI-NEXT:    v_writelane_b32 v63, s36, 4
-; VI-NEXT:    v_writelane_b32 v63, s37, 5
-; VI-NEXT:    v_writelane_b32 v63, s38, 6
-; VI-NEXT:    v_writelane_b32 v63, s39, 7
-; VI-NEXT:    v_writelane_b32 v63, s48, 8
-; VI-NEXT:    v_writelane_b32 v63, s49, 9
-; VI-NEXT:    v_writelane_b32 v63, s50, 10
-; VI-NEXT:    v_writelane_b32 v63, s51, 11
-; VI-NEXT:    v_writelane_b32 v63, s52, 12
-; VI-NEXT:    v_writelane_b32 v63, s53, 13
-; VI-NEXT:    v_writelane_b32 v63, s54, 14
-; VI-NEXT:    v_writelane_b32 v63, s55, 15
-; VI-NEXT:    v_writelane_b32 v63, s64, 16
-; VI-NEXT:    v_writelane_b32 v63, s65, 17
-; VI-NEXT:    v_writelane_b32 v63, s66, 18
-; VI-NEXT:    v_writelane_b32 v63, s67, 19
-; VI-NEXT:    v_writelane_b32 v63, s68, 20
-; VI-NEXT:    v_writelane_b32 v63, s69, 21
-; VI-NEXT:    v_writelane_b32 v63, s70, 22
-; VI-NEXT:    v_writelane_b32 v63, s71, 23
-; VI-NEXT:    v_writelane_b32 v63, s80, 24
-; VI-NEXT:    v_writelane_b32 v63, s81, 25
-; VI-NEXT:    v_writelane_b32 v63, s82, 26
-; VI-NEXT:    v_writelane_b32 v63, s83, 27
-; VI-NEXT:    v_writelane_b32 v63, s84, 28
-; VI-NEXT:    v_writelane_b32 v63, s85, 29
-; VI-NEXT:    v_writelane_b32 v63, s86, 30
+; VI-NEXT:    v_writelane_b32 v63, s34, 0
+; VI-NEXT:    v_writelane_b32 v63, s35, 1
+; VI-NEXT:    v_writelane_b32 v63, s36, 2
+; VI-NEXT:    v_writelane_b32 v63, s37, 3
+; VI-NEXT:    v_writelane_b32 v63, s38, 4
+; VI-NEXT:    v_writelane_b32 v63, s39, 5
+; VI-NEXT:    v_writelane_b32 v63, s48, 6
+; VI-NEXT:    v_writelane_b32 v63, s49, 7
+; VI-NEXT:    v_writelane_b32 v63, s50, 8
+; VI-NEXT:    v_writelane_b32 v63, s51, 9
+; VI-NEXT:    v_writelane_b32 v63, s52, 10
+; VI-NEXT:    v_writelane_b32 v63, s53, 11
+; VI-NEXT:    v_writelane_b32 v63, s54, 12
+; VI-NEXT:    v_writelane_b32 v63, s55, 13
+; VI-NEXT:    v_writelane_b32 v63, s64, 14
+; VI-NEXT:    v_writelane_b32 v63, s65, 15
+; VI-NEXT:    v_writelane_b32 v63, s66, 16
+; VI-NEXT:    v_writelane_b32 v63, s67, 17
+; VI-NEXT:    v_writelane_b32 v63, s68, 18
+; VI-NEXT:    v_writelane_b32 v63, s69, 19
+; VI-NEXT:    v_writelane_b32 v63, s70, 20
+; VI-NEXT:    v_writelane_b32 v63, s71, 21
+; VI-NEXT:    v_writelane_b32 v63, s80, 22
+; VI-NEXT:    v_writelane_b32 v63, s81, 23
+; VI-NEXT:    v_writelane_b32 v63, s82, 24
+; VI-NEXT:    v_writelane_b32 v63, s83, 25
+; VI-NEXT:    v_writelane_b32 v63, s84, 26
+; VI-NEXT:    v_writelane_b32 v63, s85, 27
+; VI-NEXT:    v_writelane_b32 v63, s86, 28
+; VI-NEXT:    v_writelane_b32 v63, s87, 29
+; VI-NEXT:    v_writelane_b32 v63, s30, 30
 ; VI-NEXT:    v_readfirstlane_b32 s44, v19
-; VI-NEXT:    v_writelane_b32 v63, s87, 31
+; VI-NEXT:    v_writelane_b32 v63, s31, 31
 ; VI-NEXT:    v_readfirstlane_b32 s5, v18
 ; VI-NEXT:    v_readfirstlane_b32 s4, v17
 ; VI-NEXT:    v_readfirstlane_b32 s7, v16
@@ -44953,38 +44953,38 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
 ; VI-NEXT:    v_add_u32_e32 v25, vcc, 32, v0
 ; VI-NEXT:    buffer_store_dword v23, v25, s[0:3], 0 offen
 ; VI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
-; VI-NEXT:    v_readlane_b32 s87, v63, 31
-; VI-NEXT:    v_readlane_b32 s86, v63, 30
-; VI-NEXT:    v_readlane_b32 s85, v63, 29
-; VI-NEXT:    v_readlane_b32 s84, v63, 28
-; VI-NEXT:    v_readlane_b32 s83, v63, 27
-; VI-NEXT:    v_readlane_b32 s82, v63, 26
-; VI-NEXT:    v_readlane_b32 s81, v63, 25
-; VI-NEXT:    v_readlane_b32 s80, v63, 24
-; VI-NEXT:    v_readlane_b32 s71, v63, 23
-; VI-NEXT:    v_readlane_b32 s70, v63, 22
-; VI-NEXT:    v_readlane_b32 s69, v63, 21
-; VI-NEXT:    v_readlane_b32 s68, v63, 20
-; VI-NEXT:    v_readlane_b32 s67, v63, 19
-; VI-NEXT:    v_readlane_b32 s66, v63, 18
-; VI-NEXT:    v_readlane_b32 s65, v63, 17
-; VI-NEXT:    v_readlane_b32 s64, v63, 16
-; VI-NEXT:    v_readlane_b32 s55, v63, 15
-; VI-NEXT:    v_readlane_b32 s54, v63, 14
-; VI-NEXT:    v_readlane_b32 s53, v63, 13
-; VI-NEXT:    v_readlane_b32 s52, v63, 12
-; VI-NEXT:    v_readlane_b32 s51, v63, 11
-; VI-NEXT:    v_readlane_b32 s50, v63, 10
-; VI-NEXT:    v_readlane_b32 s49, v63, 9
-; VI-NEXT:    v_readlane_b32 s48, v63, 8
-; VI-NEXT:    v_readlane_b32 s39, v63, 7
-; VI-NEXT:    v_readlane_b32 s38, v63, 6
-; VI-NEXT:    v_readlane_b32 s37, v63, 5
-; VI-NEXT:    v_readlane_b32 s36, v63, 4
-; VI-NEXT:    v_readlane_b32 s35, v63, 3
-; VI-NEXT:    v_readlane_b32 s34, v63, 2
-; VI-NEXT:    v_readlane_b32 s31, v63, 1
-; VI-NEXT:    v_readlane_b32 s30, v63, 0
+; VI-NEXT:    v_readlane_b32 s30, v63, 30
+; VI-NEXT:    v_readlane_b32 s31, v63, 31
+; VI-NEXT:    v_readlane_b32 s87, v63, 29
+; VI-NEXT:    v_readlane_b32 s86, v63, 28
+; VI-NEXT:    v_readlane_b32 s85, v63, 27
+; VI-NEXT:    v_readlane_b32 s84, v63, 26
+; VI-NEXT:    v_readlane_b32 s83, v63, 25
+; VI-NEXT:    v_readlane_b32 s82, v63, 24
+; VI-NEXT:    v_readlane_b32 s81, v63, 23
+; VI-NEXT:    v_readlane_b32 s80, v63, 22
+; VI-NEXT:    v_readlane_b32 s71, v63, 21
+; VI-NEXT:    v_readlane_b32 s70, v63, 20
+; VI-NEXT:    v_readlane_b32 s69, v63, 19
+; VI-NEXT:    v_readlane_b32 s68, v63, 18
+; VI-NEXT:    v_readlane_b32 s67, v63, 17
+; VI-NEXT:    v_readlane_b32 s66, v63, 16
+; VI-NEXT:    v_readlane_b32 s65, v63, 15
+; VI-NEXT:    v_readlane_b32 s64, v63, 14
+; VI-NEXT:    v_readlane_b32 s55, v63, 13
+; VI-NEXT:    v_readlane_b32 s54, v63, 12
+; VI-NEXT:    v_readlane_b32 s53, v63, 11
+; VI-NEXT:    v_readlane_b32 s52, v63, 10
+; VI-NEXT:    v_readlane_b32 s51, v63, 9
+; VI-NEXT:    v_readlane_b32 s50, v63, 8
+; VI-NEXT:    v_readlane_b32 s49, v63, 7
+; VI-NEXT:    v_readlane_b32 s48, v63, 6
+; VI-NEXT:    v_readlane_b32 s39, v63, 5
+; VI-NEXT:    v_readlane_b32 s38, v63, 4
+; VI-NEXT:    v_readlane_b32 s37, v63, 3
+; VI-NEXT:    v_readlane_b32 s36, v63, 2
+; VI-NEXT:    v_readlane_b32 s35, v63, 1
+; VI-NEXT:    v_readlane_b32 s34, v63, 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_perm_b32 v23, v24, v23, s4
 ; VI-NEXT:    v_perm_b32 v24, v34, v35, s4
@@ -45273,43 +45273,43 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
 ; GFX9-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v63, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v63, s31, 1
-; GFX9-NEXT:    v_writelane_b32 v63, s34, 2
-; GFX9-NEXT:    v_writelane_b32 v63, s35, 3
-; GFX9-NEXT:    v_writelane_b32 v63, s36, 4
-; GFX9-NEXT:    v_writelane_b32 v63, s37, 5
-; GFX9-NEXT:    v_writelane_b32 v63, s38, 6
-; GFX9-NEXT:    v_writelane_b32 v63, s39, 7
-; GFX9-NEXT:    v_writelane_b32 v63, s48, 8
-; GFX9-NEXT:    v_writelane_b32 v63, s49, 9
-; GFX9-NEXT:    v_writelane_b32 v63, s50, 10
-; GFX9-NEXT:    v_writelane_b32 v63, s51, 11
-; GFX9-NEXT:    v_writelane_b32 v63, s52, 12
-; GFX9-NEXT:    v_writelane_b32 v63, s53, 13
-; GFX9-NEXT:    v_writelane_b32 v63, s54, 14
-; GFX9-NEXT:    v_writelane_b32 v63, s55, 15
-; GFX9-NEXT:    v_writelane_b32 v63, s64, 16
-; GFX9-NEXT:    v_writelane_b32 v63, s65, 17
-; GFX9-NEXT:    v_writelane_b32 v63, s66, 18
-; GFX9-NEXT:    v_writelane_b32 v63, s67, 19
-; GFX9-NEXT:    v_writelane_b32 v63, s68, 20
-; GFX9-NEXT:    v_writelane_b32 v63, s69, 21
-; GFX9-NEXT:    v_writelane_b32 v63, s70, 22
-; GFX9-NEXT:    v_writelane_b32 v63, s71, 23
-; GFX9-NEXT:    v_writelane_b32 v63, s80, 24
-; GFX9-NEXT:    v_writelane_b32 v63, s81, 25
-; GFX9-NEXT:    v_writelane_b32 v63, s82, 26
-; GFX9-NEXT:    v_writelane_b32 v63, s83, 27
-; GFX9-NEXT:    v_writelane_b32 v63, s84, 28
-; GFX9-NEXT:    v_writelane_b32 v63, s85, 29
-; GFX9-NEXT:    v_writelane_b32 v63, s86, 30
-; GFX9-NEXT:    v_writelane_b32 v63, s87, 31
-; GFX9-NEXT:    v_writelane_b32 v63, s96, 32
-; GFX9-NEXT:    v_writelane_b32 v63, s97, 33
-; GFX9-NEXT:    v_writelane_b32 v63, s98, 34
+; GFX9-NEXT:    v_writelane_b32 v63, s34, 0
+; GFX9-NEXT:    v_writelane_b32 v63, s35, 1
+; GFX9-NEXT:    v_writelane_b32 v63, s36, 2
+; GFX9-NEXT:    v_writelane_b32 v63, s37, 3
+; GFX9-NEXT:    v_writelane_b32 v63, s38, 4
+; GFX9-NEXT:    v_writelane_b32 v63, s39, 5
+; GFX9-NEXT:    v_writelane_b32 v63, s48, 6
+; GFX9-NEXT:    v_writelane_b32 v63, s49, 7
+; GFX9-NEXT:    v_writelane_b32 v63, s50, 8
+; GFX9-NEXT:    v_writelane_b32 v63, s51, 9
+; GFX9-NEXT:    v_writelane_b32 v63, s52, 10
+; GFX9-NEXT:    v_writelane_b32 v63, s53, 11
+; GFX9-NEXT:    v_writelane_b32 v63, s54, 12
+; GFX9-NEXT:    v_writelane_b32 v63, s55, 13
+; GFX9-NEXT:    v_writelane_b32 v63, s64, 14
+; GFX9-NEXT:    v_writelane_b32 v63, s65, 15
+; GFX9-NEXT:    v_writelane_b32 v63, s66, 16
+; GFX9-NEXT:    v_writelane_b32 v63, s67, 17
+; GFX9-NEXT:    v_writelane_b32 v63, s68, 18
+; GFX9-NEXT:    v_writelane_b32 v63, s69, 19
+; GFX9-NEXT:    v_writelane_b32 v63, s70, 20
+; GFX9-NEXT:    v_writelane_b32 v63, s71, 21
+; GFX9-NEXT:    v_writelane_b32 v63, s80, 22
+; GFX9-NEXT:    v_writelane_b32 v63, s81, 23
+; GFX9-NEXT:    v_writelane_b32 v63, s82, 24
+; GFX9-NEXT:    v_writelane_b32 v63, s83, 25
+; GFX9-NEXT:    v_writelane_b32 v63, s84, 26
+; GFX9-NEXT:    v_writelane_b32 v63, s85, 27
+; GFX9-NEXT:    v_writelane_b32 v63, s86, 28
+; GFX9-NEXT:    v_writelane_b32 v63, s87, 29
+; GFX9-NEXT:    v_writelane_b32 v63, s96, 30
+; GFX9-NEXT:    v_writelane_b32 v63, s97, 31
+; GFX9-NEXT:    v_writelane_b32 v63, s98, 32
+; GFX9-NEXT:    v_writelane_b32 v63, s99, 33
+; GFX9-NEXT:    v_writelane_b32 v63, s30, 34
 ; GFX9-NEXT:    v_readfirstlane_b32 s44, v19
-; GFX9-NEXT:    v_writelane_b32 v63, s99, 35
+; GFX9-NEXT:    v_writelane_b32 v63, s31, 35
 ; GFX9-NEXT:    v_readfirstlane_b32 s5, v18
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v17
 ; GFX9-NEXT:    v_readfirstlane_b32 s7, v16
@@ -46152,42 +46152,42 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
 ; GFX9-NEXT:    v_perm_b32 v25, v25, v46, s4
 ; GFX9-NEXT:    v_perm_b32 v26, v26, v45, s4
 ; GFX9-NEXT:    v_perm_b32 v23, v23, v43, s4
-; GFX9-NEXT:    v_readlane_b32 s99, v63, 35
-; GFX9-NEXT:    v_readlane_b32 s98, v63, 34
-; GFX9-NEXT:    v_readlane_b32 s97, v63, 33
-; GFX9-NEXT:    v_readlane_b32 s96, v63, 32
-; GFX9-NEXT:    v_readlane_b32 s87, v63, 31
-; GFX9-NEXT:    v_readlane_b32 s86, v63, 30
-; GFX9-NEXT:    v_readlane_b32 s85, v63, 29
-; GFX9-NEXT:    v_readlane_b32 s84, v63, 28
-; GFX9-NEXT:    v_readlane_b32 s83, v63, 27
-; GFX9-NEXT:    v_readlane_b32 s82, v63, 26
-; GFX9-NEXT:    v_readlane_b32 s81, v63, 25
-; GFX9-NEXT:    v_readlane_b32 s80, v63, 24
-; GFX9-NEXT:    v_readlane_b32 s71, v63, 23
-; GFX9-NEXT:    v_readlane_b32 s70, v63, 22
-; GFX9-NEXT:    v_readlane_b32 s69, v63, 21
-; GFX9-NEXT:    v_readlane_b32 s68, v63, 20
-; GFX9-NEXT:    v_readlane_b32 s67, v63, 19
-; GFX9-NEXT:    v_readlane_b32 s66, v63, 18
-; GFX9-NEXT:    v_readlane_b32 s65, v63, 17
-; GFX9-NEXT:    v_readlane_b32 s64, v63, 16
-; GFX9-NEXT:    v_readlane_b32 s55, v63, 15
-; GFX9-NEXT:    v_readlane_b32 s54, v63, 14
-; GFX9-NEXT:    v_readlane_b32 s53, v63, 13
-; GFX9-NEXT:    v_readlane_b32 s52, v63, 12
-; GFX9-NEXT:    v_readlane_b32 s51, v63, 11
-; GFX9-NEXT:    v_readlane_b32 s50, v63, 10
-; GFX9-NEXT:    v_readlane_b32 s49, v63, 9
-; GFX9-NEXT:    v_readlane_b32 s48, v63, 8
-; GFX9-NEXT:    v_readlane_b32 s39, v63, 7
-; GFX9-NEXT:    v_readlane_b32 s38, v63, 6
-; GFX9-NEXT:    v_readlane_b32 s37, v63, 5
-; GFX9-NEXT:    v_readlane_b32 s36, v63, 4
-; GFX9-NEXT:    v_readlane_b32 s35, v63, 3
-; GFX9-NEXT:    v_readlane_b32 s34, v63, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v63, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v63, 0
+; GFX9-NEXT:    v_readlane_b32 s30, v63, 34
+; GFX9-NEXT:    v_readlane_b32 s31, v63, 35
+; GFX9-NEXT:    v_readlane_b32 s99, v63, 33
+; GFX9-NEXT:    v_readlane_b32 s98, v63, 32
+; GFX9-NEXT:    v_readlane_b32 s97, v63, 31
+; GFX9-NEXT:    v_readlane_b32 s96, v63, 30
+; GFX9-NEXT:    v_readlane_b32 s87, v63, 29
+; GFX9-NEXT:    v_readlane_b32 s86, v63, 28
+; GFX9-NEXT:    v_readlane_b32 s85, v63, 27
+; GFX9-NEXT:    v_readlane_b32 s84, v63, 26
+; GFX9-NEXT:    v_readlane_b32 s83, v63, 25
+; GFX9-NEXT:    v_readlane_b32 s82, v63, 24
+; GFX9-NEXT:    v_readlane_b32 s81, v63, 23
+; GFX9-NEXT:    v_readlane_b32 s80, v63, 22
+; GFX9-NEXT:    v_readlane_b32 s71, v63, 21
+; GFX9-NEXT:    v_readlane_b32 s70, v63, 20
+; GFX9-NEXT:    v_readlane_b32 s69, v63, 19
+; GFX9-NEXT:    v_readlane_b32 s68, v63, 18
+; GFX9-NEXT:    v_readlane_b32 s67, v63, 17
+; GFX9-NEXT:    v_readlane_b32 s66, v63, 16
+; GFX9-NEXT:    v_readlane_b32 s65, v63, 15
+; GFX9-NEXT:    v_readlane_b32 s64, v63, 14
+; GFX9-NEXT:    v_readlane_b32 s55, v63, 13
+; GFX9-NEXT:    v_readlane_b32 s54, v63, 12
+; GFX9-NEXT:    v_readlane_b32 s53, v63, 11
+; GFX9-NEXT:    v_readlane_b32 s52, v63, 10
+; GFX9-NEXT:    v_readlane_b32 s51, v63, 9
+; GFX9-NEXT:    v_readlane_b32 s50, v63, 8
+; GFX9-NEXT:    v_readlane_b32 s49, v63, 7
+; GFX9-NEXT:    v_readlane_b32 s48, v63, 6
+; GFX9-NEXT:    v_readlane_b32 s39, v63, 5
+; GFX9-NEXT:    v_readlane_b32 s38, v63, 4
+; GFX9-NEXT:    v_readlane_b32 s37, v63, 3
+; GFX9-NEXT:    v_readlane_b32 s36, v63, 2
+; GFX9-NEXT:    v_readlane_b32 s35, v63, 1
+; GFX9-NEXT:    v_readlane_b32 s34, v63, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-NEXT:    v_perm_b32 v24, v24, v15, s4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -46481,33 +46481,33 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
 ; GFX11-NEXT:    scratch_store_b32 off, v76, s32 offset:80
 ; GFX11-NEXT:    scratch_store_b32 off, v77, s32 offset:84
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s4
-; GFX11-NEXT:    v_writelane_b32 v74, s30, 0
-; GFX11-NEXT:    v_writelane_b32 v75, s96, 0
+; GFX11-NEXT:    v_writelane_b32 v74, s34, 0
+; GFX11-NEXT:    v_writelane_b32 v75, s98, 0
 ; GFX11-NEXT:    v_readfirstlane_b32 s42, v15
 ; GFX11-NEXT:    v_readfirstlane_b32 s5, v14
 ; GFX11-NEXT:    v_readfirstlane_b32 s4, v13
-; GFX11-NEXT:    v_writelane_b32 v74, s31, 1
-; GFX11-NEXT:    v_writelane_b32 v75, s97, 1
+; GFX11-NEXT:    v_writelane_b32 v74, s35, 1
+; GFX11-NEXT:    v_writelane_b32 v75, s99, 1
 ; GFX11-NEXT:    v_readfirstlane_b32 s7, v12
 ; GFX11-NEXT:    v_readfirstlane_b32 s6, v11
 ; GFX11-NEXT:    v_readfirstlane_b32 s9, v10
-; GFX11-NEXT:    v_writelane_b32 v74, s34, 2
-; GFX11-NEXT:    v_writelane_b32 v75, s98, 2
+; GFX11-NEXT:    v_writelane_b32 v74, s36, 2
+; GFX11-NEXT:    v_writelane_b32 v75, s100, 2
 ; GFX11-NEXT:    v_readfirstlane_b32 s8, v9
 ; GFX11-NEXT:    v_readfirstlane_b32 s11, v8
 ; GFX11-NEXT:    v_readfirstlane_b32 s10, v7
-; GFX11-NEXT:    v_writelane_b32 v74, s35, 3
-; GFX11-NEXT:    v_writelane_b32 v75, s99, 3
+; GFX11-NEXT:    v_writelane_b32 v74, s37, 3
+; GFX11-NEXT:    v_writelane_b32 v75, s101, 3
 ; GFX11-NEXT:    v_readfirstlane_b32 s13, v6
 ; GFX11-NEXT:    v_readfirstlane_b32 s12, v5
 ; GFX11-NEXT:    v_readfirstlane_b32 s15, v4
-; GFX11-NEXT:    v_writelane_b32 v74, s36, 4
-; GFX11-NEXT:    v_writelane_b32 v75, s100, 4
+; GFX11-NEXT:    v_writelane_b32 v74, s38, 4
+; GFX11-NEXT:    v_writelane_b32 v75, s102, 4
 ; GFX11-NEXT:    v_readfirstlane_b32 s14, v3
 ; GFX11-NEXT:    v_readfirstlane_b32 s41, v2
 ; GFX11-NEXT:    v_readfirstlane_b32 s40, v1
-; GFX11-NEXT:    v_writelane_b32 v74, s37, 5
-; GFX11-NEXT:    v_writelane_b32 v75, s101, 5
+; GFX11-NEXT:    v_writelane_b32 v74, s39, 5
+; GFX11-NEXT:    v_writelane_b32 v75, s103, 5
 ; GFX11-NEXT:    s_cmp_lg_u32 s42, 0
 ; GFX11-NEXT:    s_mov_b32 vcc_lo, 0
 ; GFX11-NEXT:    s_clause 0x11 ; 72-byte Folded Spill
@@ -46529,37 +46529,37 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
 ; GFX11-NEXT:    scratch_store_b32 off, v63, s32 offset:8
 ; GFX11-NEXT:    scratch_store_b32 off, v72, s32 offset:4
 ; GFX11-NEXT:    scratch_store_b32 off, v73, s32
-; GFX11-NEXT:    v_writelane_b32 v74, s38, 6
-; GFX11-NEXT:    v_writelane_b32 v75, s102, 6
+; GFX11-NEXT:    v_writelane_b32 v74, s48, 6
+; GFX11-NEXT:    v_writelane_b32 v75, s104, 6
 ; GFX11-NEXT:    ; implicit-def: $vgpr76 : SGPR spill to VGPR lane
 ; GFX11-NEXT:    ; implicit-def: $vgpr77 : SGPR spill to VGPR lane
-; GFX11-NEXT:    v_writelane_b32 v74, s39, 7
-; GFX11-NEXT:    v_writelane_b32 v75, s103, 7
-; GFX11-NEXT:    v_writelane_b32 v74, s48, 8
-; GFX11-NEXT:    v_writelane_b32 v75, s104, 8
-; GFX11-NEXT:    v_writelane_b32 v74, s49, 9
-; GFX11-NEXT:    v_writelane_b32 v74, s50, 10
-; GFX11-NEXT:    v_writelane_b32 v74, s51, 11
-; GFX11-NEXT:    v_writelane_b32 v74, s52, 12
-; GFX11-NEXT:    v_writelane_b32 v74, s53, 13
-; GFX11-NEXT:    v_writelane_b32 v74, s54, 14
-; GFX11-NEXT:    v_writelane_b32 v74, s55, 15
-; GFX11-NEXT:    v_writelane_b32 v74, s64, 16
-; GFX11-NEXT:    v_writelane_b32 v74, s65, 17
-; GFX11-NEXT:    v_writelane_b32 v74, s66, 18
-; GFX11-NEXT:    v_writelane_b32 v74, s67, 19
-; GFX11-NEXT:    v_writelane_b32 v74, s68, 20
-; GFX11-NEXT:    v_writelane_b32 v74, s69, 21
-; GFX11-NEXT:    v_writelane_b32 v74, s70, 22
-; GFX11-NEXT:    v_writelane_b32 v74, s71, 23
-; GFX11-NEXT:    v_writelane_b32 v74, s80, 24
-; GFX11-NEXT:    v_writelane_b32 v74, s81, 25
-; GFX11-NEXT:    v_writelane_b32 v74, s82, 26
-; GFX11-NEXT:    v_writelane_b32 v74, s83, 27
-; GFX11-NEXT:    v_writelane_b32 v74, s84, 28
-; GFX11-NEXT:    v_writelane_b32 v74, s85, 29
-; GFX11-NEXT:    v_writelane_b32 v74, s86, 30
-; GFX11-NEXT:    v_writelane_b32 v74, s87, 31
+; GFX11-NEXT:    v_writelane_b32 v74, s49, 7
+; GFX11-NEXT:    v_writelane_b32 v75, s30, 7
+; GFX11-NEXT:    v_writelane_b32 v74, s50, 8
+; GFX11-NEXT:    v_writelane_b32 v75, s31, 8
+; GFX11-NEXT:    v_writelane_b32 v74, s51, 9
+; GFX11-NEXT:    v_writelane_b32 v74, s52, 10
+; GFX11-NEXT:    v_writelane_b32 v74, s53, 11
+; GFX11-NEXT:    v_writelane_b32 v74, s54, 12
+; GFX11-NEXT:    v_writelane_b32 v74, s55, 13
+; GFX11-NEXT:    v_writelane_b32 v74, s64, 14
+; GFX11-NEXT:    v_writelane_b32 v74, s65, 15
+; GFX11-NEXT:    v_writelane_b32 v74, s66, 16
+; GFX11-NEXT:    v_writelane_b32 v74, s67, 17
+; GFX11-NEXT:    v_writelane_b32 v74, s68, 18
+; GFX11-NEXT:    v_writelane_b32 v74, s69, 19
+; GFX11-NEXT:    v_writelane_b32 v74, s70, 20
+; GFX11-NEXT:    v_writelane_b32 v74, s71, 21
+; GFX11-NEXT:    v_writelane_b32 v74, s80, 22
+; GFX11-NEXT:    v_writelane_b32 v74, s81, 23
+; GFX11-NEXT:    v_writelane_b32 v74, s82, 24
+; GFX11-NEXT:    v_writelane_b32 v74, s83, 25
+; GFX11-NEXT:    v_writelane_b32 v74, s84, 26
+; GFX11-NEXT:    v_writelane_b32 v74, s85, 27
+; GFX11-NEXT:    v_writelane_b32 v74, s86, 28
+; GFX11-NEXT:    v_writelane_b32 v74, s87, 29
+; GFX11-NEXT:    v_writelane_b32 v74, s96, 30
+; GFX11-NEXT:    v_writelane_b32 v74, s97, 31
 ; GFX11-NEXT:    s_cbranch_scc0 .LBB37_3
 ; GFX11-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX11-NEXT:    s_lshr_b32 s42, s5, 24
@@ -47283,47 +47283,47 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
 ; GFX11-NEXT:    scratch_load_b32 v42, off, s32 offset:60
 ; GFX11-NEXT:    scratch_load_b32 v41, off, s32 offset:64
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s32 offset:68
-; GFX11-NEXT:    v_readlane_b32 s104, v75, 8
-; GFX11-NEXT:    v_readlane_b32 s103, v75, 7
-; GFX11-NEXT:    v_readlane_b32 s102, v75, 6
-; GFX11-NEXT:    v_readlane_b32 s101, v75, 5
-; GFX11-NEXT:    v_readlane_b32 s100, v75, 4
-; GFX11-NEXT:    v_readlane_b32 s99, v75, 3
-; GFX11-NEXT:    v_readlane_b32 s98, v75, 2
-; GFX11-NEXT:    v_readlane_b32 s97, v75, 1
-; GFX11-NEXT:    v_readlane_b32 s96, v75, 0
-; GFX11-NEXT:    v_readlane_b32 s87, v74, 31
-; GFX11-NEXT:    v_readlane_b32 s86, v74, 30
-; GFX11-NEXT:    v_readlane_b32 s85, v74, 29
-; GFX11-NEXT:    v_readlane_b32 s84, v74, 28
-; GFX11-NEXT:    v_readlane_b32 s83, v74, 27
-; GFX11-NEXT:    v_readlane_b32 s82, v74, 26
-; GFX11-NEXT:    v_readlane_b32 s81, v74, 25
-; GFX11-NEXT:    v_readlane_b32 s80, v74, 24
-; GFX11-NEXT:    v_readlane_b32 s71, v74, 23
-; GFX11-NEXT:    v_readlane_b32 s70, v74, 22
-; GFX11-NEXT:    v_readlane_b32 s69, v74, 21
-; GFX11-NEXT:    v_readlane_b32 s68, v74, 20
-; GFX11-NEXT:    v_readlane_b32 s67, v74, 19
-; GFX11-NEXT:    v_readlane_b32 s66, v74, 18
-; GFX11-NEXT:    v_readlane_b32 s65, v74, 17
-; GFX11-NEXT:    v_readlane_b32 s64, v74, 16
-; GFX11-NEXT:    v_readlane_b32 s55, v74, 15
-; GFX11-NEXT:    v_readlane_b32 s54, v74, 14
-; GFX11-NEXT:    v_readlane_b32 s53, v74, 13
-; GFX11-NEXT:    v_readlane_b32 s52, v74, 12
-; GFX11-NEXT:    v_readlane_b32 s51, v74, 11
-; GFX11-NEXT:    v_readlane_b32 s50, v74, 10
-; GFX11-NEXT:    v_readlane_b32 s49, v74, 9
-; GFX11-NEXT:    v_readlane_b32 s48, v74, 8
-; GFX11-NEXT:    v_readlane_b32 s39, v74, 7
-; GFX11-NEXT:    v_readlane_b32 s38, v74, 6
-; GFX11-NEXT:    v_readlane_b32 s37, v74, 5
-; GFX11-NEXT:    v_readlane_b32 s36, v74, 4
-; GFX11-NEXT:    v_readlane_b32 s35, v74, 3
-; GFX11-NEXT:    v_readlane_b32 s34, v74, 2
-; GFX11-NEXT:    v_readlane_b32 s31, v74, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v74, 0
+; GFX11-NEXT:    v_readlane_b32 s30, v75, 7
+; GFX11-NEXT:    v_readlane_b32 s31, v75, 8
+; GFX11-NEXT:    v_readlane_b32 s104, v75, 6
+; GFX11-NEXT:    v_readlane_b32 s103, v75, 5
+; GFX11-NEXT:    v_readlane_b32 s102, v75, 4
+; GFX11-NEXT:    v_readlane_b32 s101, v75, 3
+; GFX11-NEXT:    v_readlane_b32 s100, v75, 2
+; GFX11-NEXT:    v_readlane_b32 s99, v75, 1
+; GFX11-NEXT:    v_readlane_b32 s98, v75, 0
+; GFX11-NEXT:    v_readlane_b32 s97, v74, 31
+; GFX11-NEXT:    v_readlane_b32 s96, v74, 30
+; GFX11-NEXT:    v_readlane_b32 s87, v74, 29
+; GFX11-NEXT:    v_readlane_b32 s86, v74, 28
+; GFX11-NEXT:    v_readlane_b32 s85, v74, 27
+; GFX11-NEXT:    v_readlane_b32 s84, v74, 26
+; GFX11-NEXT:    v_readlane_b32 s83, v74, 25
+; GFX11-NEXT:    v_readlane_b32 s82, v74, 24
+; GFX11-NEXT:    v_readlane_b32 s81, v74, 23
+; GFX11-NEXT:    v_readlane_b32 s80, v74, 22
+; GFX11-NEXT:    v_readlane_b32 s71, v74, 21
+; GFX11-NEXT:    v_readlane_b32 s70, v74, 20
+; GFX11-NEXT:    v_readlane_b32 s69, v74, 19
+; GFX11-NEXT:    v_readlane_b32 s68, v74, 18
+; GFX11-NEXT:    v_readlane_b32 s67, v74, 17
+; GFX11-NEXT:    v_readlane_b32 s66, v74, 16
+; GFX11-NEXT:    v_readlane_b32 s65, v74, 15
+; GFX11-NEXT:    v_readlane_b32 s64, v74, 14
+; GFX11-NEXT:    v_readlane_b32 s55, v74, 13
+; GFX11-NEXT:    v_readlane_b32 s54, v74, 12
+; GFX11-NEXT:    v_readlane_b32 s53, v74, 11
+; GFX11-NEXT:    v_readlane_b32 s52, v74, 10
+; GFX11-NEXT:    v_readlane_b32 s51, v74, 9
+; GFX11-NEXT:    v_readlane_b32 s50, v74, 8
+; GFX11-NEXT:    v_readlane_b32 s49, v74, 7
+; GFX11-NEXT:    v_readlane_b32 s48, v74, 6
+; GFX11-NEXT:    v_readlane_b32 s39, v74, 5
+; GFX11-NEXT:    v_readlane_b32 s38, v74, 4
+; GFX11-NEXT:    v_readlane_b32 s37, v74, 3
+; GFX11-NEXT:    v_readlane_b32 s36, v74, 2
+; GFX11-NEXT:    v_readlane_b32 s35, v74, 1
+; GFX11-NEXT:    v_readlane_b32 s34, v74, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX11-NEXT:    s_clause 0x3 ; 16-byte Folded Reload
 ; GFX11-NEXT:    scratch_load_b32 v74, off, s32 offset:72
@@ -58004,43 +58004,43 @@ define inreg <64 x bfloat> @bitcast_v32f32_to_v64bf16_scalar(<32 x float> inreg
 ; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_writelane_b32 v63, s30, 0
-; SI-NEXT:    v_writelane_b32 v63, s31, 1
-; SI-NEXT:    v_writelane_b32 v63, s34, 2
-; SI-NEXT:    v_writelane_b32 v63, s35, 3
-; SI-NEXT:    v_writelane_b32 v63, s36, 4
-; SI-NEXT:    v_writelane_b32 v63, s37, 5
-; SI-NEXT:    v_writelane_b32 v63, s38, 6
-; SI-NEXT:    v_writelane_b32 v63, s39, 7
-; SI-NEXT:    v_writelane_b32 v63, s48, 8
-; SI-NEXT:    v_writelane_b32 v63, s49, 9
-; SI-NEXT:    v_writelane_b32 v63, s50, 10
-; SI-NEXT:    v_writelane_b32 v63, s51, 11
-; SI-NEXT:    v_writelane_b32 v63, s52, 12
-; SI-NEXT:    v_writelane_b32 v63, s53, 13
-; SI-NEXT:    v_writelane_b32 v63, s54, 14
-; SI-NEXT:    v_writelane_b32 v63, s55, 15
-; SI-NEXT:    v_writelane_b32 v63, s64, 16
-; SI-NEXT:    v_writelane_b32 v63, s65, 17
-; SI-NEXT:    v_writelane_b32 v63, s66, 18
-; SI-NEXT:    v_writelane_b32 v63, s67, 19
-; SI-NEXT:    v_writelane_b32 v63, s68, 20
-; SI-NEXT:    v_writelane_b32 v63, s69, 21
-; SI-NEXT:    v_writelane_b32 v63, s70, 22
-; SI-NEXT:    v_writelane_b32 v63, s71, 23
-; SI-NEXT:    v_writelane_b32 v63, s80, 24
-; SI-NEXT:    v_writelane_b32 v63, s81, 25
-; SI-NEXT:    v_writelane_b32 v63, s82, 26
-; SI-NEXT:    v_writelane_b32 v63, s83, 27
-; SI-NEXT:    v_writelane_b32 v63, s84, 28
-; SI-NEXT:    v_writelane_b32 v63, s85, 29
-; SI-NEXT:    v_writelane_b32 v63, s86, 30
-; SI-NEXT:    v_writelane_b32 v63, s87, 31
-; SI-NEXT:    v_writelane_b32 v63, s96, 32
-; SI-NEXT:    v_writelane_b32 v63, s97, 33
-; SI-NEXT:    v_writelane_b32 v63, s98, 34
+; SI-NEXT:    v_writelane_b32 v63, s34, 0
+; SI-NEXT:    v_writelane_b32 v63, s35, 1
+; SI-NEXT:    v_writelane_b32 v63, s36, 2
+; SI-NEXT:    v_writelane_b32 v63, s37, 3
+; SI-NEXT:    v_writelane_b32 v63, s38, 4
+; SI-NEXT:    v_writelane_b32 v63, s39, 5
+; SI-NEXT:    v_writelane_b32 v63, s48, 6
+; SI-NEXT:    v_writelane_b32 v63, s49, 7
+; SI-NEXT:    v_writelane_b32 v63, s50, 8
+; SI-NEXT:    v_writelane_b32 v63, s51, 9
+; SI-NEXT:    v_writelane_b32 v63, s52, 10
+; SI-NEXT:    v_writelane_b32 v63, s53, 11
+; SI-NEXT:    v_writelane_b32 v63, s54, 12
+; SI-NEXT:    v_writelane_b32 v63, s55, 13
+; SI-NEXT:    v_writelane_b32 v63, s64, 14
+; SI-NEXT:    v_writelane_b32 v63, s65, 15
+; SI-NEXT:    v_writelane_b32 v63, s66, 16
+; SI-NEXT:    v_writelane_b32 v63, s67, 17
+; SI-NEXT:    v_writelane_b32 v63, s68, 18
+; SI-NEXT:    v_writelane_b32 v63, s69, 19
+; SI-NEXT:    v_writelane_b32 v63, s70, 20
+; SI-NEXT:    v_writelane_b32 v63, s71, 21
+; SI-NEXT:    v_writelane_b32 v63, s80, 22
+; SI-NEXT:    v_writelane_b32 v63, s81, 23
+; SI-NEXT:    v_writelane_b32 v63, s82, 24
+; SI-NEXT:    v_writelane_b32 v63, s83, 25
+; SI-NEXT:    v_writelane_b32 v63, s84, 26
+; SI-NEXT:    v_writelane_b32 v63, s85, 27
+; SI-NEXT:    v_writelane_b32 v63, s86, 28
+; SI-NEXT:    v_writelane_b32 v63, s87, 29
+; SI-NEXT:    v_writelane_b32 v63, s96, 30
+; SI-NEXT:    v_writelane_b32 v63, s97, 31
+; SI-NEXT:    v_writelane_b32 v63, s98, 32
+; SI-NEXT:    v_writelane_b32 v63, s99, 33
+; SI-NEXT:    v_writelane_b32 v63, s30, 34
 ; SI-NEXT:    v_readfirstlane_b32 s4, v18
-; SI-NEXT:    v_writelane_b32 v63, s99, 35
+; SI-NEXT:    v_writelane_b32 v63, s31, 35
 ; SI-NEXT:    v_readfirstlane_b32 s6, v17
 ; SI-NEXT:    v_readfirstlane_b32 s7, v16
 ; SI-NEXT:    v_readfirstlane_b32 s8, v15
@@ -58514,42 +58514,42 @@ define inreg <64 x bfloat> @bitcast_v32f32_to_v64bf16_scalar(<32 x float> inreg
 ; SI-NEXT:    v_mul_f32_e32 v54, 1.0, v54
 ; SI-NEXT:    v_mul_f32_e32 v52, 1.0, v52
 ; SI-NEXT:    v_mul_f32_e32 v50, 1.0, v50
-; SI-NEXT:    v_readlane_b32 s99, v63, 35
-; SI-NEXT:    v_readlane_b32 s98, v63, 34
-; SI-NEXT:    v_readlane_b32 s97, v63, 33
-; SI-NEXT:    v_readlane_b32 s96, v63, 32
-; SI-NEXT:    v_readlane_b32 s87, v63, 31
-; SI-NEXT:    v_readlane_b32 s86, v63, 30
-; SI-NEXT:    v_readlane_b32 s85, v63, 29
-; SI-NEXT:    v_readlane_b32 s84, v63, 28
-; SI-NEXT:    v_readlane_b32 s83, v63, 27
-; SI-NEXT:    v_readlane_b32 s82, v63, 26
-; SI-NEXT:    v_readlane_b32 s81, v63, 25
-; SI-NEXT:    v_readlane_b32 s80, v63, 24
-; SI-NEXT:    v_readlane_b32 s71, v63, 23
-; SI-NEXT:    v_readlane_b32 s70, v63, 22
-; SI-NEXT:    v_readlane_b32 s69, v63, 21
-; SI-NEXT:    v_readlane_b32 s68, v63, 20
-; SI-NEXT:    v_readlane_b32 s67, v63, 19
-; SI-NEXT:    v_readlane_b32 s66, v63, 18
-; SI-NEXT:    v_readlane_b32 s65, v63, 17
-; SI-NEXT:    v_readlane_b32 s64, v63, 16
-; SI-NEXT:    v_readlane_b32 s55, v63, 15
-; SI-NEXT:    v_readlane_b32 s54, v63, 14
-; SI-NEXT:    v_readlane_b32 s53, v63, 13
-; SI-NEXT:    v_readlane_b32 s52, v63, 12
-; SI-NEXT:    v_readlane_b32 s51, v63, 11
-; SI-NEXT:    v_readlane_b32 s50, v63, 10
-; SI-NEXT:    v_readlane_b32 s49, v63, 9
-; SI-NEXT:    v_readlane_b32 s48, v63, 8
-; SI-NEXT:    v_readlane_b32 s39, v63, 7
-; SI-NEXT:    v_readlane_b32 s38, v63, 6
-; SI-NEXT:    v_readlane_b32 s37, v63, 5
-; SI-NEXT:    v_readlane_b32 s36, v63, 4
-; SI-NEXT:    v_readlane_b32 s35, v63, 3
-; SI-NEXT:    v_readlane_b32 s34, v63, 2
-; SI-NEXT:    v_readlane_b32 s31, v63, 1
-; SI-NEXT:    v_readlane_b32 s30, v63, 0
+; SI-NEXT:    v_readlane_b32 s30, v63, 34
+; SI-NEXT:    v_readlane_b32 s31, v63, 35
+; SI-NEXT:    v_readlane_b32 s99, v63, 33
+; SI-NEXT:    v_readlane_b32 s98, v63, 32
+; SI-NEXT:    v_readlane_b32 s97, v63, 31
+; SI-NEXT:    v_readlane_b32 s96, v63, 30
+; SI-NEXT:    v_readlane_b32 s87, v63, 29
+; SI-NEXT:    v_readlane_b32 s86, v63, 28
+; SI-NEXT:    v_readlane_b32 s85, v63, 27
+; SI-NEXT:    v_readlane_b32 s84, v63, 26
+; SI-NEXT:    v_readlane_b32 s83, v63, 25
+; SI-NEXT:    v_readlane_b32 s82, v63, 24
+; SI-NEXT:    v_readlane_b32 s81, v63, 23
+; SI-NEXT:    v_readlane_b32 s80, v63, 22
+; SI-NEXT:    v_readlane_b32 s71, v63, 21
+; SI-NEXT:    v_readlane_b32 s70, v63, 20
+; SI-NEXT:    v_readlane_b32 s69, v63, 19
+; SI-NEXT:    v_readlane_b32 s68, v63, 18
+; SI-NEXT:    v_readlane_b32 s67, v63, 17
+; SI-NEXT:    v_readlane_b32 s66, v63, 16
+; SI-NEXT:    v_readlane_b32 s65, v63, 15
+; SI-NEXT:    v_readlane_b32 s64, v63, 14
+; SI-NEXT:    v_readlane_b32 s55, v63, 13
+; SI-NEXT:    v_readlane_b32 s54, v63, 12
+; SI-NEXT:    v_readlane_b32 s53, v63, 11
+; SI-NEXT:    v_readlane_b32 s52, v63, 10
+; SI-NEXT:    v_readlane_b32 s51, v63, 9
+; SI-NEXT:    v_readlane_b32 s50, v63, 8
+; SI-NEXT:    v_readlane_b32 s49, v63, 7
+; SI-NEXT:    v_readlane_b32 s48, v63, 6
+; SI-NEXT:    v_readlane_b32 s39, v63, 5
+; SI-NEXT:    v_readlane_b32 s38, v63, 4
+; SI-NEXT:    v_readlane_b32 s37, v63, 3
+; SI-NEXT:    v_readlane_b32 s36, v63, 2
+; SI-NEXT:    v_readlane_b32 s35, v63, 1
+; SI-NEXT:    v_readlane_b32 s34, v63, 0
 ; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_mul_f32_e32 v48, 1.0, v0
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
@@ -58639,17 +58639,17 @@ define inreg <64 x bfloat> @bitcast_v32f32_to_v64bf16_scalar(<32 x float> inreg
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v32, s30, 0
-; VI-NEXT:    v_writelane_b32 v32, s31, 1
-; VI-NEXT:    v_writelane_b32 v32, s36, 2
-; VI-NEXT:    v_writelane_b32 v32, s37, 3
-; VI-NEXT:    v_writelane_b32 v32, s38, 4
-; VI-NEXT:    v_writelane_b32 v32, s39, 5
-; VI-NEXT:    v_writelane_b32 v32, s48, 6
-; VI-NEXT:    v_writelane_b32 v32, s49, 7
-; VI-NEXT:    v_writelane_b32 v32, s50, 8
+; VI-NEXT:    v_writelane_b32 v32, s36, 0
+; VI-NEXT:    v_writelane_b32 v32, s37, 1
+; VI-NEXT:    v_writelane_b32 v32, s38, 2
+; VI-NEXT:    v_writelane_b32 v32, s39, 3
+; VI-NEXT:    v_writelane_b32 v32, s48, 4
+; VI-NEXT:    v_writelane_b32 v32, s49, 5
+; VI-NEXT:    v_writelane_b32 v32, s50, 6
+; VI-NEXT:    v_writelane_b32 v32, s51, 7
+; VI-NEXT:    v_writelane_b32 v32, s30, 8
 ; VI-NEXT:    v_readfirstlane_b32 s4, v18
-; VI-NEXT:    v_writelane_b32 v32, s51, 9
+; VI-NEXT:    v_writelane_b32 v32, s31, 9
 ; VI-NEXT:    v_readfirstlane_b32 s51, v17
 ; VI-NEXT:    v_readfirstlane_b32 s50, v16
 ; VI-NEXT:    v_readfirstlane_b32 s49, v15
@@ -58742,16 +58742,16 @@ define inreg <64 x bfloat> @bitcast_v32f32_to_v64bf16_scalar(<32 x float> inreg
 ; VI-NEXT:    v_mov_b32_e32 v30, s50
 ; VI-NEXT:    v_mov_b32_e32 v31, s51
 ; VI-NEXT:  .LBB41_5: ; %end
-; VI-NEXT:    v_readlane_b32 s51, v32, 9
-; VI-NEXT:    v_readlane_b32 s50, v32, 8
-; VI-NEXT:    v_readlane_b32 s49, v32, 7
-; VI-NEXT:    v_readlane_b32 s48, v32, 6
-; VI-NEXT:    v_readlane_b32 s39, v32, 5
-; VI-NEXT:    v_readlane_b32 s38, v32, 4
-; VI-NEXT:    v_readlane_b32 s37, v32, 3
-; VI-NEXT:    v_readlane_b32 s36, v32, 2
-; VI-NEXT:    v_readlane_b32 s31, v32, 1
-; VI-NEXT:    v_readlane_b32 s30, v32, 0
+; VI-NEXT:    v_readlane_b32 s30, v32, 8
+; VI-NEXT:    v_readlane_b32 s31, v32, 9
+; VI-NEXT:    v_readlane_b32 s51, v32, 7
+; VI-NEXT:    v_readlane_b32 s50, v32, 6
+; VI-NEXT:    v_readlane_b32 s49, v32, 5
+; VI-NEXT:    v_readlane_b32 s48, v32, 4
+; VI-NEXT:    v_readlane_b32 s39, v32, 3
+; VI-NEXT:    v_readlane_b32 s38, v32, 2
+; VI-NEXT:    v_readlane_b32 s37, v32, 1
+; VI-NEXT:    v_readlane_b32 s36, v32, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -58764,17 +58764,17 @@ define inreg <64 x bfloat> @bitcast_v32f32_to_v64bf16_scalar(<32 x float> inreg
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v32, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v32, s31, 1
-; GFX9-NEXT:    v_writelane_b32 v32, s36, 2
-; GFX9-NEXT:    v_writelane_b32 v32, s37, 3
-; GFX9-NEXT:    v_writelane_b32 v32, s38, 4
-; GFX9-NEXT:    v_writelane_b32 v32, s39, 5
-; GFX9-NEXT:    v_writelane_b32 v32, s48, 6
-; GFX9-NEXT:    v_writelane_b32 v32, s49, 7
-; GFX9-NEXT:    v_writelane_b32 v32, s50, 8
+; GFX9-NEXT:    v_writelane_b32 v32, s36, 0
+; GFX9-NEXT:    v_writelane_b32 v32, s37, 1
+; GFX9-NEXT:    v_writelane_b32 v32, s38, 2
+; GFX9-NEXT:    v_writelane_b32 v32, s39, 3
+; GFX9-NEXT:    v_writelane_b32 v32, s48, 4
+; GFX9-NEXT:    v_writelane_b32 v32, s49, 5
+; GFX9-NEXT:    v_writelane_b32 v32, s50, 6
+; GFX9-NEXT:    v_writelane_b32 v32, s51, 7
+; GFX9-NEXT:    v_writelane_b32 v32, s30, 8
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v18
-; GFX9-NEXT:    v_writelane_b32 v32, s51, 9
+; GFX9-NEXT:    v_writelane_b32 v32, s31, 9
 ; GFX9-NEXT:    v_readfirstlane_b32 s51, v17
 ; GFX9-NEXT:    v_readfirstlane_b32 s50, v16
 ; GFX9-NEXT:    v_readfirstlane_b32 s49, v15
@@ -58867,16 +58867,16 @@ define inreg <64 x bfloat> @bitcast_v32f32_to_v64bf16_scalar(<32 x float> inreg
 ; GFX9-NEXT:    v_mov_b32_e32 v30, s50
 ; GFX9-NEXT:    v_mov_b32_e32 v31, s51
 ; GFX9-NEXT:  .LBB41_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s51, v32, 9
-; GFX9-NEXT:    v_readlane_b32 s50, v32, 8
-; GFX9-NEXT:    v_readlane_b32 s49, v32, 7
-; GFX9-NEXT:    v_readlane_b32 s48, v32, 6
-; GFX9-NEXT:    v_readlane_b32 s39, v32, 5
-; GFX9-NEXT:    v_readlane_b32 s38, v32, 4
-; GFX9-NEXT:    v_readlane_b32 s37, v32, 3
-; GFX9-NEXT:    v_readlane_b32 s36, v32, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v32, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v32, 0
+; GFX9-NEXT:    v_readlane_b32 s30, v32, 8
+; GFX9-NEXT:    v_readlane_b32 s31, v32, 9
+; GFX9-NEXT:    v_readlane_b32 s51, v32, 7
+; GFX9-NEXT:    v_readlane_b32 s50, v32, 6
+; GFX9-NEXT:    v_readlane_b32 s49, v32, 5
+; GFX9-NEXT:    v_readlane_b32 s48, v32, 4
+; GFX9-NEXT:    v_readlane_b32 s39, v32, 3
+; GFX9-NEXT:    v_readlane_b32 s38, v32, 2
+; GFX9-NEXT:    v_readlane_b32 s37, v32, 1
+; GFX9-NEXT:    v_readlane_b32 s36, v32, 0
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -62008,15 +62008,17 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
 ; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v63, s30, 0
-; SI-NEXT:    v_writelane_b32 v63, s31, 1
+; SI-NEXT:    v_writelane_b32 v63, s34, 0
+; SI-NEXT:    v_writelane_b32 v63, s35, 1
+; SI-NEXT:    v_writelane_b32 v63, s36, 2
+; SI-NEXT:    v_writelane_b32 v63, s37, 3
+; SI-NEXT:    v_writelane_b32 v63, s30, 4
 ; SI-NEXT:    v_readfirstlane_b32 vcc_lo, v2
-; SI-NEXT:    v_writelane_b32 v63, s34, 2
+; SI-NEXT:    v_writelane_b32 v63, s31, 5
 ; SI-NEXT:    s_and_b32 s12, s25, 0xffff0000
 ; SI-NEXT:    s_and_b32 s30, vcc_lo, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s31, vcc_lo, 16
 ; SI-NEXT:    v_readfirstlane_b32 vcc_lo, v1
-; SI-NEXT:    v_writelane_b32 v63, s35, 3
 ; SI-NEXT:    s_and_b32 s6, s28, 0xffff0000
 ; SI-NEXT:    s_and_b32 s34, vcc_lo, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s35, vcc_lo, 16
@@ -62094,13 +62096,11 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mul_f32_e64 v0, 1.0, s47
-; SI-NEXT:    v_writelane_b32 v63, s36, 4
 ; SI-NEXT:    s_and_b32 s43, s42, 0xffff0000
 ; SI-NEXT:    v_readfirstlane_b32 s92, v4
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mul_f32_e64 v0, 1.0, s45
-; SI-NEXT:    v_writelane_b32 v63, s37, 5
 ; SI-NEXT:    s_and_b32 s4, s29, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s5, s29, 16
 ; SI-NEXT:    s_lshl_b32 s7, s28, 16
@@ -62814,12 +62814,12 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
 ; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; SI-NEXT:    v_readlane_b32 s37, v63, 5
-; SI-NEXT:    v_readlane_b32 s36, v63, 4
-; SI-NEXT:    v_readlane_b32 s35, v63, 3
-; SI-NEXT:    v_readlane_b32 s34, v63, 2
-; SI-NEXT:    v_readlane_b32 s31, v63, 1
-; SI-NEXT:    v_readlane_b32 s30, v63, 0
+; SI-NEXT:    v_readlane_b32 s30, v63, 4
+; SI-NEXT:    v_readlane_b32 s31, v63, 5
+; SI-NEXT:    v_readlane_b32 s37, v63, 3
+; SI-NEXT:    v_readlane_b32 s36, v63, 2
+; SI-NEXT:    v_readlane_b32 s35, v63, 1
+; SI-NEXT:    v_readlane_b32 s34, v63, 0
 ; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -66217,29 +66217,29 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a,
 ; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v63, s30, 0
-; SI-NEXT:    v_writelane_b32 v63, s31, 1
-; SI-NEXT:    v_writelane_b32 v63, s34, 2
-; SI-NEXT:    v_writelane_b32 v63, s35, 3
-; SI-NEXT:    v_writelane_b32 v63, s36, 4
-; SI-NEXT:    v_writelane_b32 v63, s37, 5
-; SI-NEXT:    v_writelane_b32 v63, s38, 6
-; SI-NEXT:    v_writelane_b32 v63, s39, 7
-; SI-NEXT:    v_writelane_b32 v63, s48, 8
-; SI-NEXT:    v_writelane_b32 v63, s49, 9
-; SI-NEXT:    v_writelane_b32 v63, s50, 10
-; SI-NEXT:    v_writelane_b32 v63, s51, 11
-; SI-NEXT:    v_writelane_b32 v63, s52, 12
-; SI-NEXT:    v_writelane_b32 v63, s53, 13
-; SI-NEXT:    v_writelane_b32 v63, s54, 14
-; SI-NEXT:    v_writelane_b32 v63, s55, 15
-; SI-NEXT:    v_writelane_b32 v63, s64, 16
-; SI-NEXT:    v_writelane_b32 v63, s65, 17
-; SI-NEXT:    v_writelane_b32 v63, s66, 18
-; SI-NEXT:    v_writelane_b32 v63, s67, 19
-; SI-NEXT:    v_writelane_b32 v63, s68, 20
+; SI-NEXT:    v_writelane_b32 v63, s34, 0
+; SI-NEXT:    v_writelane_b32 v63, s35, 1
+; SI-NEXT:    v_writelane_b32 v63, s36, 2
+; SI-NEXT:    v_writelane_b32 v63, s37, 3
+; SI-NEXT:    v_writelane_b32 v63, s38, 4
+; SI-NEXT:    v_writelane_b32 v63, s39, 5
+; SI-NEXT:    v_writelane_b32 v63, s48, 6
+; SI-NEXT:    v_writelane_b32 v63, s49, 7
+; SI-NEXT:    v_writelane_b32 v63, s50, 8
+; SI-NEXT:    v_writelane_b32 v63, s51, 9
+; SI-NEXT:    v_writelane_b32 v63, s52, 10
+; SI-NEXT:    v_writelane_b32 v63, s53, 11
+; SI-NEXT:    v_writelane_b32 v63, s54, 12
+; SI-NEXT:    v_writelane_b32 v63, s55, 13
+; SI-NEXT:    v_writelane_b32 v63, s64, 14
+; SI-NEXT:    v_writelane_b32 v63, s65, 15
+; SI-NEXT:    v_writelane_b32 v63, s66, 16
+; SI-NEXT:    v_writelane_b32 v63, s67, 17
+; SI-NEXT:    v_writelane_b32 v63, s68, 18
+; SI-NEXT:    v_writelane_b32 v63, s69, 19
+; SI-NEXT:    v_writelane_b32 v63, s30, 20
 ; SI-NEXT:    v_readfirstlane_b32 s44, v18
-; SI-NEXT:    v_writelane_b32 v63, s69, 21
+; SI-NEXT:    v_writelane_b32 v63, s31, 21
 ; SI-NEXT:    v_readfirstlane_b32 s5, v17
 ; SI-NEXT:    v_readfirstlane_b32 s4, v16
 ; SI-NEXT:    v_readfirstlane_b32 s7, v15
@@ -66592,28 +66592,28 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a,
 ; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
 ; SI-NEXT:    v_and_b32_e32 v30, 0xffff, v30
 ; SI-NEXT:    v_and_b32_e32 v31, 0xffff, v31
-; SI-NEXT:    v_readlane_b32 s69, v63, 21
-; SI-NEXT:    v_readlane_b32 s68, v63, 20
-; SI-NEXT:    v_readlane_b32 s67, v63, 19
-; SI-NEXT:    v_readlane_b32 s66, v63, 18
-; SI-NEXT:    v_readlane_b32 s65, v63, 17
-; SI-NEXT:    v_readlane_b32 s64, v63, 16
-; SI-NEXT:    v_readlane_b32 s55, v63, 15
-; SI-NEXT:    v_readlane_b32 s54, v63, 14
-; SI-NEXT:    v_readlane_b32 s53, v63, 13
-; SI-NEXT:    v_readlane_b32 s52, v63, 12
-; SI-NEXT:    v_readlane_b32 s51, v63, 11
-; SI-NEXT:    v_readlane_b32 s50, v63, 10
-; SI-NEXT:    v_readlane_b32 s49, v63, 9
-; SI-NEXT:    v_readlane_b32 s48, v63, 8
-; SI-NEXT:    v_readlane_b32 s39, v63, 7
-; SI-NEXT:    v_readlane_b32 s38, v63, 6
-; SI-NEXT:    v_readlane_b32 s37, v63, 5
-; SI-NEXT:    v_readlane_b32 s36, v63, 4
-; SI-NEXT:    v_readlane_b32 s35, v63, 3
-; SI-NEXT:    v_readlane_b32 s34, v63, 2
-; SI-NEXT:    v_readlane_b32 s31, v63, 1
-; SI-NEXT:    v_readlane_b32 s30, v63, 0
+; SI-NEXT:    v_readlane_b32 s30, v63, 20
+; SI-NEXT:    v_readlane_b32 s31, v63, 21
+; SI-NEXT:    v_readlane_b32 s69, v63, 19
+; SI-NEXT:    v_readlane_b32 s68, v63, 18
+; SI-NEXT:    v_readlane_b32 s67, v63, 17
+; SI-NEXT:    v_readlane_b32 s66, v63, 16
+; SI-NEXT:    v_readlane_b32 s65, v63, 15
+; SI-NEXT:    v_readlane_b32 s64, v63, 14
+; SI-NEXT:    v_readlane_b32 s55, v63, 13
+; SI-NEXT:    v_readlane_b32 s54, v63, 12
+; SI-NEXT:    v_readlane_b32 s53, v63, 11
+; SI-NEXT:    v_readlane_b32 s52, v63, 10
+; SI-NEXT:    v_readlane_b32 s51, v63, 9
+; SI-NEXT:    v_readlane_b32 s50, v63, 8
+; SI-NEXT:    v_readlane_b32 s49, v63, 7
+; SI-NEXT:    v_readlane_b32 s48, v63, 6
+; SI-NEXT:    v_readlane_b32 s39, v63, 5
+; SI-NEXT:    v_readlane_b32 s38, v63, 4
+; SI-NEXT:    v_readlane_b32 s37, v63, 3
+; SI-NEXT:    v_readlane_b32 s36, v63, 2
+; SI-NEXT:    v_readlane_b32 s35, v63, 1
+; SI-NEXT:    v_readlane_b32 s34, v63, 0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v32, 16, v32
 ; SI-NEXT:    v_or_b32_e32 v30, v30, v32
@@ -66631,17 +66631,17 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a,
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v32, s30, 0
-; VI-NEXT:    v_writelane_b32 v32, s31, 1
-; VI-NEXT:    v_writelane_b32 v32, s36, 2
-; VI-NEXT:    v_writelane_b32 v32, s37, 3
-; VI-NEXT:    v_writelane_b32 v32, s38, 4
-; VI-NEXT:    v_writelane_b32 v32, s39, 5
-; VI-NEXT:    v_writelane_b32 v32, s48, 6
-; VI-NEXT:    v_writelane_b32 v32, s49, 7
-; VI-NEXT:    v_writelane_b32 v32, s50, 8
+; VI-NEXT:    v_writelane_b32 v32, s36, 0
+; VI-NEXT:    v_writelane_b32 v32, s37, 1
+; VI-NEXT:    v_writelane_b32 v32, s38, 2
+; VI-NEXT:    v_writelane_b32 v32, s39, 3
+; VI-NEXT:    v_writelane_b32 v32, s48, 4
+; VI-NEXT:    v_writelane_b32 v32, s49, 5
+; VI-NEXT:    v_writelane_b32 v32, s50, 6
+; VI-NEXT:    v_writelane_b32 v32, s51, 7
+; VI-NEXT:    v_writelane_b32 v32, s30, 8
 ; VI-NEXT:    v_readfirstlane_b32 s4, v18
-; VI-NEXT:    v_writelane_b32 v32, s51, 9
+; VI-NEXT:    v_writelane_b32 v32, s31, 9
 ; VI-NEXT:    v_readfirstlane_b32 s51, v17
 ; VI-NEXT:    v_readfirstlane_b32 s50, v16
 ; VI-NEXT:    v_readfirstlane_b32 s49, v15
@@ -66734,16 +66734,16 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a,
 ; VI-NEXT:    v_mov_b32_e32 v30, s50
 ; VI-NEXT:    v_mov_b32_e32 v31, s51
 ; VI-NEXT:  .LBB45_5: ; %end
-; VI-NEXT:    v_readlane_b32 s51, v32, 9
-; VI-NEXT:    v_readlane_b32 s50, v32, 8
-; VI-NEXT:    v_readlane_b32 s49, v32, 7
-; VI-NEXT:    v_readlane_b32 s48, v32, 6
-; VI-NEXT:    v_readlane_b32 s39, v32, 5
-; VI-NEXT:    v_readlane_b32 s38, v32, 4
-; VI-NEXT:    v_readlane_b32 s37, v32, 3
-; VI-NEXT:    v_readlane_b32 s36, v32, 2
-; VI-NEXT:    v_readlane_b32 s31, v32, 1
-; VI-NEXT:    v_readlane_b32 s30, v32, 0
+; VI-NEXT:    v_readlane_b32 s30, v32, 8
+; VI-NEXT:    v_readlane_b32 s31, v32, 9
+; VI-NEXT:    v_readlane_b32 s51, v32, 7
+; VI-NEXT:    v_readlane_b32 s50, v32, 6
+; VI-NEXT:    v_readlane_b32 s49, v32, 5
+; VI-NEXT:    v_readlane_b32 s48, v32, 4
+; VI-NEXT:    v_readlane_b32 s39, v32, 3
+; VI-NEXT:    v_readlane_b32 s38, v32, 2
+; VI-NEXT:    v_readlane_b32 s37, v32, 1
+; VI-NEXT:    v_readlane_b32 s36, v32, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -66756,17 +66756,17 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a,
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v32, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v32, s31, 1
-; GFX9-NEXT:    v_writelane_b32 v32, s36, 2
-; GFX9-NEXT:    v_writelane_b32 v32, s37, 3
-; GFX9-NEXT:    v_writelane_b32 v32, s38, 4
-; GFX9-NEXT:    v_writelane_b32 v32, s39, 5
-; GFX9-NEXT:    v_writelane_b32 v32, s48, 6
-; GFX9-NEXT:    v_writelane_b32 v32, s49, 7
-; GFX9-NEXT:    v_writelane_b32 v32, s50, 8
+; GFX9-NEXT:    v_writelane_b32 v32, s36, 0
+; GFX9-NEXT:    v_writelane_b32 v32, s37, 1
+; GFX9-NEXT:    v_writelane_b32 v32, s38, 2
+; GFX9-NEXT:    v_writelane_b32 v32, s39, 3
+; GFX9-NEXT:    v_writelane_b32 v32, s48, 4
+; GFX9-NEXT:    v_writelane_b32 v32, s49, 5
+; GFX9-NEXT:    v_writelane_b32 v32, s50, 6
+; GFX9-NEXT:    v_writelane_b32 v32, s51, 7
+; GFX9-NEXT:    v_writelane_b32 v32, s30, 8
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v18
-; GFX9-NEXT:    v_writelane_b32 v32, s51, 9
+; GFX9-NEXT:    v_writelane_b32 v32, s31, 9
 ; GFX9-NEXT:    v_readfirstlane_b32 s51, v17
 ; GFX9-NEXT:    v_readfirstlane_b32 s50, v16
 ; GFX9-NEXT:    v_readfirstlane_b32 s49, v15
@@ -66859,16 +66859,16 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a,
 ; GFX9-NEXT:    v_mov_b32_e32 v30, s50
 ; GFX9-NEXT:    v_mov_b32_e32 v31, s51
 ; GFX9-NEXT:  .LBB45_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s51, v32, 9
-; GFX9-NEXT:    v_readlane_b32 s50, v32, 8
-; GFX9-NEXT:    v_readlane_b32 s49, v32, 7
-; GFX9-NEXT:    v_readlane_b32 s48, v32, 6
-; GFX9-NEXT:    v_readlane_b32 s39, v32, 5
-; GFX9-NEXT:    v_readlane_b32 s38, v32, 4
-; GFX9-NEXT:    v_readlane_b32 s37, v32, 3
-; GFX9-NEXT:    v_readlane_b32 s36, v32, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v32, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v32, 0
+; GFX9-NEXT:    v_readlane_b32 s30, v32, 8
+; GFX9-NEXT:    v_readlane_b32 s31, v32, 9
+; GFX9-NEXT:    v_readlane_b32 s51, v32, 7
+; GFX9-NEXT:    v_readlane_b32 s50, v32, 6
+; GFX9-NEXT:    v_readlane_b32 s49, v32, 5
+; GFX9-NEXT:    v_readlane_b32 s48, v32, 4
+; GFX9-NEXT:    v_readlane_b32 s39, v32, 3
+; GFX9-NEXT:    v_readlane_b32 s38, v32, 2
+; GFX9-NEXT:    v_readlane_b32 s37, v32, 1
+; GFX9-NEXT:    v_readlane_b32 s36, v32, 0
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -67968,45 +67968,46 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a,
 ; SI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_writelane_b32 v34, s30, 0
-; SI-NEXT:    v_writelane_b32 v34, s31, 1
-; SI-NEXT:    v_writelane_b32 v34, s34, 2
-; SI-NEXT:    v_writelane_b32 v34, s35, 3
-; SI-NEXT:    v_writelane_b32 v34, s36, 4
-; SI-NEXT:    v_writelane_b32 v34, s37, 5
-; SI-NEXT:    v_writelane_b32 v34, s38, 6
-; SI-NEXT:    v_writelane_b32 v34, s39, 7
-; SI-NEXT:    v_writelane_b32 v34, s48, 8
-; SI-NEXT:    v_writelane_b32 v34, s49, 9
-; SI-NEXT:    v_writelane_b32 v34, s50, 10
-; SI-NEXT:    v_writelane_b32 v34, s51, 11
-; SI-NEXT:    v_writelane_b32 v34, s52, 12
-; SI-NEXT:    v_writelane_b32 v34, s53, 13
-; SI-NEXT:    v_writelane_b32 v34, s54, 14
-; SI-NEXT:    v_writelane_b32 v34, s55, 15
-; SI-NEXT:    v_writelane_b32 v34, s64, 16
-; SI-NEXT:    v_writelane_b32 v34, s65, 17
-; SI-NEXT:    v_writelane_b32 v34, s66, 18
-; SI-NEXT:    v_writelane_b32 v34, s67, 19
-; SI-NEXT:    v_writelane_b32 v34, s68, 20
-; SI-NEXT:    v_writelane_b32 v34, s69, 21
-; SI-NEXT:    v_writelane_b32 v34, s70, 22
-; SI-NEXT:    v_writelane_b32 v34, s71, 23
-; SI-NEXT:    v_writelane_b32 v34, s80, 24
-; SI-NEXT:    v_writelane_b32 v34, s81, 25
-; SI-NEXT:    v_writelane_b32 v34, s82, 26
-; SI-NEXT:    v_writelane_b32 v34, s83, 27
-; SI-NEXT:    v_writelane_b32 v34, s84, 28
-; SI-NEXT:    v_writelane_b32 v34, s85, 29
-; SI-NEXT:    v_writelane_b32 v34, s86, 30
-; SI-NEXT:    v_writelane_b32 v34, s87, 31
-; SI-NEXT:    v_writelane_b32 v34, s96, 32
+; SI-NEXT:    v_writelane_b32 v34, s34, 0
+; SI-NEXT:    v_writelane_b32 v34, s35, 1
+; SI-NEXT:    v_writelane_b32 v34, s36, 2
+; SI-NEXT:    v_writelane_b32 v34, s37, 3
+; SI-NEXT:    v_writelane_b32 v34, s38, 4
+; SI-NEXT:    v_writelane_b32 v34, s39, 5
+; SI-NEXT:    v_writelane_b32 v34, s48, 6
+; SI-NEXT:    v_writelane_b32 v34, s49, 7
+; SI-NEXT:    v_writelane_b32 v34, s50, 8
+; SI-NEXT:    v_writelane_b32 v34, s51, 9
+; SI-NEXT:    v_writelane_b32 v34, s52, 10
+; SI-NEXT:    v_writelane_b32 v34, s53, 11
+; SI-NEXT:    v_writelane_b32 v34, s54, 12
+; SI-NEXT:    v_writelane_b32 v34, s55, 13
+; SI-NEXT:    v_writelane_b32 v34, s64, 14
+; SI-NEXT:    v_writelane_b32 v34, s65, 15
+; SI-NEXT:    v_writelane_b32 v34, s66, 16
+; SI-NEXT:    v_writelane_b32 v34, s67, 17
+; SI-NEXT:    v_writelane_b32 v34, s68, 18
+; SI-NEXT:    v_writelane_b32 v34, s69, 19
+; SI-NEXT:    v_writelane_b32 v34, s70, 20
+; SI-NEXT:    v_writelane_b32 v34, s71, 21
+; SI-NEXT:    v_writelane_b32 v34, s80, 22
+; SI-NEXT:    v_writelane_b32 v34, s81, 23
+; SI-NEXT:    v_writelane_b32 v34, s82, 24
+; SI-NEXT:    v_writelane_b32 v34, s83, 25
+; SI-NEXT:    v_writelane_b32 v34, s84, 26
+; SI-NEXT:    v_writelane_b32 v34, s85, 27
+; SI-NEXT:    v_writelane_b32 v34, s86, 28
+; SI-NEXT:    v_writelane_b32 v34, s87, 29
+; SI-NEXT:    v_writelane_b32 v34, s96, 30
+; SI-NEXT:    v_writelane_b32 v34, s97, 31
+; SI-NEXT:    v_writelane_b32 v34, s98, 32
+; SI-NEXT:    v_writelane_b32 v34, s99, 33
 ; SI-NEXT:    v_readfirstlane_b32 s6, v17
-; SI-NEXT:    v_writelane_b32 v34, s97, 33
+; SI-NEXT:    v_writelane_b32 v34, s30, 34
 ; SI-NEXT:    s_lshr_b32 vcc_lo, s6, 16
 ; SI-NEXT:    v_readfirstlane_b32 s8, v16
 ; SI-NEXT:    ; implicit-def: $vgpr35 : SGPR spill to VGPR lane
-; SI-NEXT:    v_writelane_b32 v34, s98, 34
+; SI-NEXT:    v_writelane_b32 v34, s31, 35
 ; SI-NEXT:    s_lshr_b32 vcc_hi, s8, 16
 ; SI-NEXT:    v_readfirstlane_b32 s10, v15
 ; SI-NEXT:    v_readfirstlane_b32 s12, v14
@@ -68026,7 +68027,6 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a,
 ; SI-NEXT:    v_readfirstlane_b32 s83, v0
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_writelane_b32 v35, vcc_lo, 0
-; SI-NEXT:    v_writelane_b32 v34, s99, 35
 ; SI-NEXT:    s_lshr_b32 s69, s29, 16
 ; SI-NEXT:    s_lshr_b32 s71, s28, 16
 ; SI-NEXT:    s_lshr_b32 s82, s27, 16
@@ -68460,42 +68460,42 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a,
 ; SI-NEXT:    v_mov_b32_e32 v30, s66
 ; SI-NEXT:    v_mov_b32_e32 v31, s67
 ; SI-NEXT:  .LBB47_5: ; %end
-; SI-NEXT:    v_readlane_b32 s99, v34, 35
-; SI-NEXT:    v_readlane_b32 s98, v34, 34
-; SI-NEXT:    v_readlane_b32 s97, v34, 33
-; SI-NEXT:    v_readlane_b32 s96, v34, 32
-; SI-NEXT:    v_readlane_b32 s87, v34, 31
-; SI-NEXT:    v_readlane_b32 s86, v34, 30
-; SI-NEXT:    v_readlane_b32 s85, v34, 29
-; SI-NEXT:    v_readlane_b32 s84, v34, 28
-; SI-NEXT:    v_readlane_b32 s83, v34, 27
-; SI-NEXT:    v_readlane_b32 s82, v34, 26
-; SI-NEXT:    v_readlane_b32 s81, v34, 25
-; SI-NEXT:    v_readlane_b32 s80, v34, 24
-; SI-NEXT:    v_readlane_b32 s71, v34, 23
-; SI-NEXT:    v_readlane_b32 s70, v34, 22
-; SI-NEXT:    v_readlane_b32 s69, v34, 21
-; SI-NEXT:    v_readlane_b32 s68, v34, 20
-; SI-NEXT:    v_readlane_b32 s67, v34, 19
-; SI-NEXT:    v_readlane_b32 s66, v34, 18
-; SI-NEXT:    v_readlane_b32 s65, v34, 17
-; SI-NEXT:    v_readlane_b32 s64, v34, 16
-; SI-NEXT:    v_readlane_b32 s55, v34, 15
-; SI-NEXT:    v_readlane_b32 s54, v34, 14
-; SI-NEXT:    v_readlane_b32 s53, v34, 13
-; SI-NEXT:    v_readlane_b32 s52, v34, 12
-; SI-NEXT:    v_readlane_b32 s51, v34, 11
-; SI-NEXT:    v_readlane_b32 s50, v34, 10
-; SI-NEXT:    v_readlane_b32 s49, v34, 9
-; SI-NEXT:    v_readlane_b32 s48, v34, 8
-; SI-NEXT:    v_readlane_b32 s39, v34, 7
-; SI-NEXT:    v_readlane_b32 s38, v34, 6
-; SI-NEXT:    v_readlane_b32 s37, v34, 5
-; SI-NEXT:    v_readlane_b32 s36, v34, 4
-; SI-NEXT:    v_readlane_b32 s35, v34, 3
-; SI-NEXT:    v_readlane_b32 s34, v34, 2
-; SI-NEXT:    v_readlane_b32 s31, v34, 1
-; SI-NEXT:    v_readlane_b32 s30, v34, 0
+; SI-NEXT:    v_readlane_b32 s30, v34, 34
+; SI-NEXT:    v_readlane_b32 s31, v34, 35
+; SI-NEXT:    v_readlane_b32 s99, v34, 33
+; SI-NEXT:    v_readlane_b32 s98, v34, 32
+; SI-NEXT:    v_readlane_b32 s97, v34, 31
+; SI-NEXT:    v_readlane_b32 s96, v34, 30
+; SI-NEXT:    v_readlane_b32 s87, v34, 29
+; SI-NEXT:    v_readlane_b32 s86, v34, 28
+; SI-NEXT:    v_readlane_b32 s85, v34, 27
+; SI-NEXT:    v_readlane_b32 s84, v34, 26
+; SI-NEXT:    v_readlane_b32 s83, v34, 25
+; SI-NEXT:    v_readlane_b32 s82, v34, 24
+; SI-NEXT:    v_readlane_b32 s81, v34, 23
+; SI-NEXT:    v_readlane_b32 s80, v34, 22
+; SI-NEXT:    v_readlane_b32 s71, v34, 21
+; SI-NEXT:    v_readlane_b32 s70, v34, 20
+; SI-NEXT:    v_readlane_b32 s69, v34, 19
+; SI-NEXT:    v_readlane_b32 s68, v34, 18
+; SI-NEXT:    v_readlane_b32 s67, v34, 17
+; SI-NEXT:    v_readlane_b32 s66, v34, 16
+; SI-NEXT:    v_readlane_b32 s65, v34, 15
+; SI-NEXT:    v_readlane_b32 s64, v34, 14
+; SI-NEXT:    v_readlane_b32 s55, v34, 13
+; SI-NEXT:    v_readlane_b32 s54, v34, 12
+; SI-NEXT:    v_readlane_b32 s53, v34, 11
+; SI-NEXT:    v_readlane_b32 s52, v34, 10
+; SI-NEXT:    v_readlane_b32 s51, v34, 9
+; SI-NEXT:    v_readlane_b32 s50, v34, 8
+; SI-NEXT:    v_readlane_b32 s49, v34, 7
+; SI-NEXT:    v_readlane_b32 s48, v34, 6
+; SI-NEXT:    v_readlane_b32 s39, v34, 5
+; SI-NEXT:    v_readlane_b32 s38, v34, 4
+; SI-NEXT:    v_readlane_b32 s37, v34, 3
+; SI-NEXT:    v_readlane_b32 s36, v34, 2
+; SI-NEXT:    v_readlane_b32 s35, v34, 1
+; SI-NEXT:    v_readlane_b32 s34, v34, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -69540,29 +69540,29 @@ define inreg <64 x i16> @bitcast_v32f32_to_v64i16_scalar(<32 x float> inreg %a,
 ; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v63, s30, 0
-; SI-NEXT:    v_writelane_b32 v63, s31, 1
-; SI-NEXT:    v_writelane_b32 v63, s34, 2
-; SI-NEXT:    v_writelane_b32 v63, s35, 3
-; SI-NEXT:    v_writelane_b32 v63, s36, 4
-; SI-NEXT:    v_writelane_b32 v63, s37, 5
-; SI-NEXT:    v_writelane_b32 v63, s38, 6
-; SI-NEXT:    v_writelane_b32 v63, s39, 7
-; SI-NEXT:    v_writelane_b32 v63, s48, 8
-; SI-NEXT:    v_writelane_b32 v63, s49, 9
-; SI-NEXT:    v_writelane_b32 v63, s50, 10
-; SI-NEXT:    v_writelane_b32 v63, s51, 11
-; SI-NEXT:    v_writelane_b32 v63, s52, 12
-; SI-NEXT:    v_writelane_b32 v63, s53, 13
-; SI-NEXT:    v_writelane_b32 v63, s54, 14
-; SI-NEXT:    v_writelane_b32 v63, s55, 15
-; SI-NEXT:    v_writelane_b32 v63, s64, 16
-; SI-NEXT:    v_writelane_b32 v63, s65, 17
-; SI-NEXT:    v_writelane_b32 v63, s66, 18
-; SI-NEXT:    v_writelane_b32 v63, s67, 19
-; SI-NEXT:    v_writelane_b32 v63, s68, 20
+; SI-NEXT:    v_writelane_b32 v63, s34, 0
+; SI-NEXT:    v_writelane_b32 v63, s35, 1
+; SI-NEXT:    v_writelane_b32 v63, s36, 2
+; SI-NEXT:    v_writelane_b32 v63, s37, 3
+; SI-NEXT:    v_writelane_b32 v63, s38, 4
+; SI-NEXT:    v_writelane_b32 v63, s39, 5
+; SI-NEXT:    v_writelane_b32 v63, s48, 6
+; SI-NEXT:    v_writelane_b32 v63, s49, 7
+; SI-NEXT:    v_writelane_b32 v63, s50, 8
+; SI-NEXT:    v_writelane_b32 v63, s51, 9
+; SI-NEXT:    v_writelane_b32 v63, s52, 10
+; SI-NEXT:    v_writelane_b32 v63, s53, 11
+; SI-NEXT:    v_writelane_b32 v63, s54, 12
+; SI-NEXT:    v_writelane_b32 v63, s55, 13
+; SI-NEXT:    v_writelane_b32 v63, s64, 14
+; SI-NEXT:    v_writelane_b32 v63, s65, 15
+; SI-NEXT:    v_writelane_b32 v63, s66, 16
+; SI-NEXT:    v_writelane_b32 v63, s67, 17
+; SI-NEXT:    v_writelane_b32 v63, s68, 18
+; SI-NEXT:    v_writelane_b32 v63, s69, 19
+; SI-NEXT:    v_writelane_b32 v63, s30, 20
 ; SI-NEXT:    v_readfirstlane_b32 s44, v18
-; SI-NEXT:    v_writelane_b32 v63, s69, 21
+; SI-NEXT:    v_writelane_b32 v63, s31, 21
 ; SI-NEXT:    v_readfirstlane_b32 s5, v17
 ; SI-NEXT:    v_readfirstlane_b32 s4, v16
 ; SI-NEXT:    v_readfirstlane_b32 s7, v15
@@ -69915,28 +69915,28 @@ define inreg <64 x i16> @bitcast_v32f32_to_v64i16_scalar(<32 x float> inreg %a,
 ; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
 ; SI-NEXT:    v_and_b32_e32 v30, 0xffff, v30
 ; SI-NEXT:    v_and_b32_e32 v31, 0xffff, v31
-; SI-NEXT:    v_readlane_b32 s69, v63, 21
-; SI-NEXT:    v_readlane_b32 s68, v63, 20
-; SI-NEXT:    v_readlane_b32 s67, v63, 19
-; SI-NEXT:    v_readlane_b32 s66, v63, 18
-; SI-NEXT:    v_readlane_b32 s65, v63, 17
-; SI-NEXT:    v_readlane_b32 s64, v63, 16
-; SI-NEXT:    v_readlane_b32 s55, v63, 15
-; SI-NEXT:    v_readlane_b32 s54, v63, 14
-; SI-NEXT:    v_readlane_b32 s53, v63, 13
-; SI-NEXT:    v_readlane_b32 s52, v63, 12
-; SI-NEXT:    v_readlane_b32 s51, v63, 11
-; SI-NEXT:    v_readlane_b32 s50, v63, 10
-; SI-NEXT:    v_readlane_b32 s49, v63, 9
-; SI-NEXT:    v_readlane_b32 s48, v63, 8
-; SI-NEXT:    v_readlane_b32 s39, v63, 7
-; SI-NEXT:    v_readlane_b32 s38, v63, 6
-; SI-NEXT:    v_readlane_b32 s37, v63, 5
-; SI-NEXT:    v_readlane_b32 s36, v63, 4
-; SI-NEXT:    v_readlane_b32 s35, v63, 3
-; SI-NEXT:    v_readlane_b32 s34, v63, 2
-; SI-NEXT:    v_readlane_b32 s31, v63, 1
-; SI-NEXT:    v_readlane_b32 s30, v63, 0
+; SI-NEXT:    v_readlane_b32 s30, v63, 20
+; SI-NEXT:    v_readlane_b32 s31, v63, 21
+; SI-NEXT:    v_readlane_b32 s69, v63, 19
+; SI-NEXT:    v_readlane_b32 s68, v63, 18
+; SI-NEXT:    v_readlane_b32 s67, v63, 17
+; SI-NEXT:    v_readlane_b32 s66, v63, 16
+; SI-NEXT:    v_readlane_b32 s65, v63, 15
+; SI-NEXT:    v_readlane_b32 s64, v63, 14
+; SI-NEXT:    v_readlane_b32 s55, v63, 13
+; SI-NEXT:    v_readlane_b32 s54, v63, 12
+; SI-NEXT:    v_readlane_b32 s53, v63, 11
+; SI-NEXT:    v_readlane_b32 s52, v63, 10
+; SI-NEXT:    v_readlane_b32 s51, v63, 9
+; SI-NEXT:    v_readlane_b32 s50, v63, 8
+; SI-NEXT:    v_readlane_b32 s49, v63, 7
+; SI-NEXT:    v_readlane_b32 s48, v63, 6
+; SI-NEXT:    v_readlane_b32 s39, v63, 5
+; SI-NEXT:    v_readlane_b32 s38, v63, 4
+; SI-NEXT:    v_readlane_b32 s37, v63, 3
+; SI-NEXT:    v_readlane_b32 s36, v63, 2
+; SI-NEXT:    v_readlane_b32 s35, v63, 1
+; SI-NEXT:    v_readlane_b32 s34, v63, 0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v32, 16, v32
 ; SI-NEXT:    v_or_b32_e32 v30, v30, v32
@@ -69954,17 +69954,17 @@ define inreg <64 x i16> @bitcast_v32f32_to_v64i16_scalar(<32 x float> inreg %a,
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v32, s30, 0
-; VI-NEXT:    v_writelane_b32 v32, s31, 1
-; VI-NEXT:    v_writelane_b32 v32, s36, 2
-; VI-NEXT:    v_writelane_b32 v32, s37, 3
-; VI-NEXT:    v_writelane_b32 v32, s38, 4
-; VI-NEXT:    v_writelane_b32 v32, s39, 5
-; VI-NEXT:    v_writelane_b32 v32, s48, 6
-; VI-NEXT:    v_writelane_b32 v32, s49, 7
-; VI-NEXT:    v_writelane_b32 v32, s50, 8
+; VI-NEXT:    v_writelane_b32 v32, s36, 0
+; VI-NEXT:    v_writelane_b32 v32, s37, 1
+; VI-NEXT:    v_writelane_b32 v32, s38, 2
+; VI-NEXT:    v_writelane_b32 v32, s39, 3
+; VI-NEXT:    v_writelane_b32 v32, s48, 4
+; VI-NEXT:    v_writelane_b32 v32, s49, 5
+; VI-NEXT:    v_writelane_b32 v32, s50, 6
+; VI-NEXT:    v_writelane_b32 v32, s51, 7
+; VI-NEXT:    v_writelane_b32 v32, s30, 8
 ; VI-NEXT:    v_readfirstlane_b32 s4, v18
-; VI-NEXT:    v_writelane_b32 v32, s51, 9
+; VI-NEXT:    v_writelane_b32 v32, s31, 9
 ; VI-NEXT:    v_readfirstlane_b32 s51, v17
 ; VI-NEXT:    v_readfirstlane_b32 s50, v16
 ; VI-NEXT:    v_readfirstlane_b32 s49, v15
@@ -70057,16 +70057,16 @@ define inreg <64 x i16> @bitcast_v32f32_to_v64i16_scalar(<32 x float> inreg %a,
 ; VI-NEXT:    v_mov_b32_e32 v30, s50
 ; VI-NEXT:    v_mov_b32_e32 v31, s51
 ; VI-NEXT:  .LBB49_5: ; %end
-; VI-NEXT:    v_readlane_b32 s51, v32, 9
-; VI-NEXT:    v_readlane_b32 s50, v32, 8
-; VI-NEXT:    v_readlane_b32 s49, v32, 7
-; VI-NEXT:    v_readlane_b32 s48, v32, 6
-; VI-NEXT:    v_readlane_b32 s39, v32, 5
-; VI-NEXT:    v_readlane_b32 s38, v32, 4
-; VI-NEXT:    v_readlane_b32 s37, v32, 3
-; VI-NEXT:    v_readlane_b32 s36, v32, 2
-; VI-NEXT:    v_readlane_b32 s31, v32, 1
-; VI-NEXT:    v_readlane_b32 s30, v32, 0
+; VI-NEXT:    v_readlane_b32 s30, v32, 8
+; VI-NEXT:    v_readlane_b32 s31, v32, 9
+; VI-NEXT:    v_readlane_b32 s51, v32, 7
+; VI-NEXT:    v_readlane_b32 s50, v32, 6
+; VI-NEXT:    v_readlane_b32 s49, v32, 5
+; VI-NEXT:    v_readlane_b32 s48, v32, 4
+; VI-NEXT:    v_readlane_b32 s39, v32, 3
+; VI-NEXT:    v_readlane_b32 s38, v32, 2
+; VI-NEXT:    v_readlane_b32 s37, v32, 1
+; VI-NEXT:    v_readlane_b32 s36, v32, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -70079,17 +70079,17 @@ define inreg <64 x i16> @bitcast_v32f32_to_v64i16_scalar(<32 x float> inreg %a,
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v32, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v32, s31, 1
-; GFX9-NEXT:    v_writelane_b32 v32, s36, 2
-; GFX9-NEXT:    v_writelane_b32 v32, s37, 3
-; GFX9-NEXT:    v_writelane_b32 v32, s38, 4
-; GFX9-NEXT:    v_writelane_b32 v32, s39, 5
-; GFX9-NEXT:    v_writelane_b32 v32, s48, 6
-; GFX9-NEXT:    v_writelane_b32 v32, s49, 7
-; GFX9-NEXT:    v_writelane_b32 v32, s50, 8
+; GFX9-NEXT:    v_writelane_b32 v32, s36, 0
+; GFX9-NEXT:    v_writelane_b32 v32, s37, 1
+; GFX9-NEXT:    v_writelane_b32 v32, s38, 2
+; GFX9-NEXT:    v_writelane_b32 v32, s39, 3
+; GFX9-NEXT:    v_writelane_b32 v32, s48, 4
+; GFX9-NEXT:    v_writelane_b32 v32, s49, 5
+; GFX9-NEXT:    v_writelane_b32 v32, s50, 6
+; GFX9-NEXT:    v_writelane_b32 v32, s51, 7
+; GFX9-NEXT:    v_writelane_b32 v32, s30, 8
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v18
-; GFX9-NEXT:    v_writelane_b32 v32, s51, 9
+; GFX9-NEXT:    v_writelane_b32 v32, s31, 9
 ; GFX9-NEXT:    v_readfirstlane_b32 s51, v17
 ; GFX9-NEXT:    v_readfirstlane_b32 s50, v16
 ; GFX9-NEXT:    v_readfirstlane_b32 s49, v15
@@ -70182,16 +70182,16 @@ define inreg <64 x i16> @bitcast_v32f32_to_v64i16_scalar(<32 x float> inreg %a,
 ; GFX9-NEXT:    v_mov_b32_e32 v30, s50
 ; GFX9-NEXT:    v_mov_b32_e32 v31, s51
 ; GFX9-NEXT:  .LBB49_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s51, v32, 9
-; GFX9-NEXT:    v_readlane_b32 s50, v32, 8
-; GFX9-NEXT:    v_readlane_b32 s49, v32, 7
-; GFX9-NEXT:    v_readlane_b32 s48, v32, 6
-; GFX9-NEXT:    v_readlane_b32 s39, v32, 5
-; GFX9-NEXT:    v_readlane_b32 s38, v32, 4
-; GFX9-NEXT:    v_readlane_b32 s37, v32, 3
-; GFX9-NEXT:    v_readlane_b32 s36, v32, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v32, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v32, 0
+; GFX9-NEXT:    v_readlane_b32 s30, v32, 8
+; GFX9-NEXT:    v_readlane_b32 s31, v32, 9
+; GFX9-NEXT:    v_readlane_b32 s51, v32, 7
+; GFX9-NEXT:    v_readlane_b32 s50, v32, 6
+; GFX9-NEXT:    v_readlane_b32 s49, v32, 5
+; GFX9-NEXT:    v_readlane_b32 s48, v32, 4
+; GFX9-NEXT:    v_readlane_b32 s39, v32, 3
+; GFX9-NEXT:    v_readlane_b32 s38, v32, 2
+; GFX9-NEXT:    v_readlane_b32 s37, v32, 1
+; GFX9-NEXT:    v_readlane_b32 s36, v32, 0
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -71121,45 +71121,46 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a,
 ; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_writelane_b32 v32, s30, 0
-; SI-NEXT:    v_writelane_b32 v32, s31, 1
-; SI-NEXT:    v_writelane_b32 v32, s34, 2
-; SI-NEXT:    v_writelane_b32 v32, s35, 3
-; SI-NEXT:    v_writelane_b32 v32, s36, 4
-; SI-NEXT:    v_writelane_b32 v32, s37, 5
-; SI-NEXT:    v_writelane_b32 v32, s38, 6
-; SI-NEXT:    v_writelane_b32 v32, s39, 7
-; SI-NEXT:    v_writelane_b32 v32, s48, 8
-; SI-NEXT:    v_writelane_b32 v32, s49, 9
-; SI-NEXT:    v_writelane_b32 v32, s50, 10
-; SI-NEXT:    v_writelane_b32 v32, s51, 11
-; SI-NEXT:    v_writelane_b32 v32, s52, 12
-; SI-NEXT:    v_writelane_b32 v32, s53, 13
-; SI-NEXT:    v_writelane_b32 v32, s54, 14
-; SI-NEXT:    v_writelane_b32 v32, s55, 15
-; SI-NEXT:    v_writelane_b32 v32, s64, 16
-; SI-NEXT:    v_writelane_b32 v32, s65, 17
-; SI-NEXT:    v_writelane_b32 v32, s66, 18
-; SI-NEXT:    v_writelane_b32 v32, s67, 19
-; SI-NEXT:    v_writelane_b32 v32, s68, 20
-; SI-NEXT:    v_writelane_b32 v32, s69, 21
-; SI-NEXT:    v_writelane_b32 v32, s70, 22
-; SI-NEXT:    v_writelane_b32 v32, s71, 23
-; SI-NEXT:    v_writelane_b32 v32, s80, 24
-; SI-NEXT:    v_writelane_b32 v32, s81, 25
-; SI-NEXT:    v_writelane_b32 v32, s82, 26
-; SI-NEXT:    v_writelane_b32 v32, s83, 27
-; SI-NEXT:    v_writelane_b32 v32, s84, 28
-; SI-NEXT:    v_writelane_b32 v32, s85, 29
-; SI-NEXT:    v_writelane_b32 v32, s86, 30
-; SI-NEXT:    v_writelane_b32 v32, s87, 31
-; SI-NEXT:    v_writelane_b32 v32, s96, 32
+; SI-NEXT:    v_writelane_b32 v32, s34, 0
+; SI-NEXT:    v_writelane_b32 v32, s35, 1
+; SI-NEXT:    v_writelane_b32 v32, s36, 2
+; SI-NEXT:    v_writelane_b32 v32, s37, 3
+; SI-NEXT:    v_writelane_b32 v32, s38, 4
+; SI-NEXT:    v_writelane_b32 v32, s39, 5
+; SI-NEXT:    v_writelane_b32 v32, s48, 6
+; SI-NEXT:    v_writelane_b32 v32, s49, 7
+; SI-NEXT:    v_writelane_b32 v32, s50, 8
+; SI-NEXT:    v_writelane_b32 v32, s51, 9
+; SI-NEXT:    v_writelane_b32 v32, s52, 10
+; SI-NEXT:    v_writelane_b32 v32, s53, 11
+; SI-NEXT:    v_writelane_b32 v32, s54, 12
+; SI-NEXT:    v_writelane_b32 v32, s55, 13
+; SI-NEXT:    v_writelane_b32 v32, s64, 14
+; SI-NEXT:    v_writelane_b32 v32, s65, 15
+; SI-NEXT:    v_writelane_b32 v32, s66, 16
+; SI-NEXT:    v_writelane_b32 v32, s67, 17
+; SI-NEXT:    v_writelane_b32 v32, s68, 18
+; SI-NEXT:    v_writelane_b32 v32, s69, 19
+; SI-NEXT:    v_writelane_b32 v32, s70, 20
+; SI-NEXT:    v_writelane_b32 v32, s71, 21
+; SI-NEXT:    v_writelane_b32 v32, s80, 22
+; SI-NEXT:    v_writelane_b32 v32, s81, 23
+; SI-NEXT:    v_writelane_b32 v32, s82, 24
+; SI-NEXT:    v_writelane_b32 v32, s83, 25
+; SI-NEXT:    v_writelane_b32 v32, s84, 26
+; SI-NEXT:    v_writelane_b32 v32, s85, 27
+; SI-NEXT:    v_writelane_b32 v32, s86, 28
+; SI-NEXT:    v_writelane_b32 v32, s87, 29
+; SI-NEXT:    v_writelane_b32 v32, s96, 30
+; SI-NEXT:    v_writelane_b32 v32, s97, 31
+; SI-NEXT:    v_writelane_b32 v32, s98, 32
+; SI-NEXT:    v_writelane_b32 v32, s99, 33
 ; SI-NEXT:    v_readfirstlane_b32 s9, v16
-; SI-NEXT:    v_writelane_b32 v32, s97, 33
+; SI-NEXT:    v_writelane_b32 v32, s30, 34
 ; SI-NEXT:    s_lshr_b32 s14, s9, 16
 ; SI-NEXT:    v_readfirstlane_b32 s13, v14
 ; SI-NEXT:    ; implicit-def: $vgpr33 : SGPR spill to VGPR lane
-; SI-NEXT:    v_writelane_b32 v32, s98, 34
+; SI-NEXT:    v_writelane_b32 v32, s31, 35
 ; SI-NEXT:    v_readfirstlane_b32 s7, v17
 ; SI-NEXT:    v_readfirstlane_b32 s11, v15
 ; SI-NEXT:    s_lshr_b32 s72, s13, 16
@@ -71179,7 +71180,6 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a,
 ; SI-NEXT:    v_readfirstlane_b32 s97, v0
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_writelane_b32 v33, s14, 0
-; SI-NEXT:    v_writelane_b32 v32, s99, 35
 ; SI-NEXT:    s_lshr_b32 s92, s29, 16
 ; SI-NEXT:    s_lshr_b32 s95, s28, 16
 ; SI-NEXT:    s_lshr_b32 s34, s27, 16
@@ -71484,6 +71484,7 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a,
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_add_i32 s67, s4, 0x30000
 ; SI-NEXT:  .LBB51_3: ; %end
+; SI-NEXT:    v_readlane_b32 s30, v32, 34
 ; SI-NEXT:    v_mov_b32_e32 v0, s36
 ; SI-NEXT:    v_mov_b32_e32 v1, s37
 ; SI-NEXT:    v_mov_b32_e32 v2, s38
@@ -71516,42 +71517,41 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a,
 ; SI-NEXT:    v_mov_b32_e32 v29, s65
 ; SI-NEXT:    v_mov_b32_e32 v30, s66
 ; SI-NEXT:    v_mov_b32_e32 v31, s67
-; SI-NEXT:    v_readlane_b32 s99, v32, 35
-; SI-NEXT:    v_readlane_b32 s98, v32, 34
-; SI-NEXT:    v_readlane_b32 s97, v32, 33
-; SI-NEXT:    v_readlane_b32 s96, v32, 32
-; SI-NEXT:    v_readlane_b32 s87, v32, 31
-; SI-NEXT:    v_readlane_b32 s86, v32, 30
-; SI-NEXT:    v_readlane_b32 s85, v32, 29
-; SI-NEXT:    v_readlane_b32 s84, v32, 28
-; SI-NEXT:    v_readlane_b32 s83, v32, 27
-; SI-NEXT:    v_readlane_b32 s82, v32, 26
-; SI-NEXT:    v_readlane_b32 s81, v32, 25
-; SI-NEXT:    v_readlane_b32 s80, v32, 24
-; SI-NEXT:    v_readlane_b32 s71, v32, 23
-; SI-NEXT:    v_readlane_b32 s70, v32, 22
-; SI-NEXT:    v_readlane_b32 s69, v32, 21
-; SI-NEXT:    v_readlane_b32 s68, v32, 20
-; SI-NEXT:    v_readlane_b32 s67, v32, 19
-; SI-NEXT:    v_readlane_b32 s66, v32, 18
-; SI-NEXT:    v_readlane_b32 s65, v32, 17
-; SI-NEXT:    v_readlane_b32 s64, v32, 16
-; SI-NEXT:    v_readlane_b32 s55, v32, 15
-; SI-NEXT:    v_readlane_b32 s54, v32, 14
-; SI-NEXT:    v_readlane_b32 s53, v32, 13
-; SI-NEXT:    v_readlane_b32 s52, v32, 12
-; SI-NEXT:    v_readlane_b32 s51, v32, 11
-; SI-NEXT:    v_readlane_b32 s50, v32, 10
-; SI-NEXT:    v_readlane_b32 s49, v32, 9
-; SI-NEXT:    v_readlane_b32 s48, v32, 8
-; SI-NEXT:    v_readlane_b32 s39, v32, 7
-; SI-NEXT:    v_readlane_b32 s38, v32, 6
-; SI-NEXT:    v_readlane_b32 s37, v32, 5
-; SI-NEXT:    v_readlane_b32 s36, v32, 4
-; SI-NEXT:    v_readlane_b32 s35, v32, 3
-; SI-NEXT:    v_readlane_b32 s34, v32, 2
-; SI-NEXT:    v_readlane_b32 s31, v32, 1
-; SI-NEXT:    v_readlane_b32 s30, v32, 0
+; SI-NEXT:    v_readlane_b32 s31, v32, 35
+; SI-NEXT:    v_readlane_b32 s99, v32, 33
+; SI-NEXT:    v_readlane_b32 s98, v32, 32
+; SI-NEXT:    v_readlane_b32 s97, v32, 31
+; SI-NEXT:    v_readlane_b32 s96, v32, 30
+; SI-NEXT:    v_readlane_b32 s87, v32, 29
+; SI-NEXT:    v_readlane_b32 s86, v32, 28
+; SI-NEXT:    v_readlane_b32 s85, v32, 27
+; SI-NEXT:    v_readlane_b32 s84, v32, 26
+; SI-NEXT:    v_readlane_b32 s83, v32, 25
+; SI-NEXT:    v_readlane_b32 s82, v32, 24
+; SI-NEXT:    v_readlane_b32 s81, v32, 23
+; SI-NEXT:    v_readlane_b32 s80, v32, 22
+; SI-NEXT:    v_readlane_b32 s71, v32, 21
+; SI-NEXT:    v_readlane_b32 s70, v32, 20
+; SI-NEXT:    v_readlane_b32 s69, v32, 19
+; SI-NEXT:    v_readlane_b32 s68, v32, 18
+; SI-NEXT:    v_readlane_b32 s67, v32, 17
+; SI-NEXT:    v_readlane_b32 s66, v32, 16
+; SI-NEXT:    v_readlane_b32 s65, v32, 15
+; SI-NEXT:    v_readlane_b32 s64, v32, 14
+; SI-NEXT:    v_readlane_b32 s55, v32, 13
+; SI-NEXT:    v_readlane_b32 s54, v32, 12
+; SI-NEXT:    v_readlane_b32 s53, v32, 11
+; SI-NEXT:    v_readlane_b32 s52, v32, 10
+; SI-NEXT:    v_readlane_b32 s51, v32, 9
+; SI-NEXT:    v_readlane_b32 s50, v32, 8
+; SI-NEXT:    v_readlane_b32 s49, v32, 7
+; SI-NEXT:    v_readlane_b32 s48, v32, 6
+; SI-NEXT:    v_readlane_b32 s39, v32, 5
+; SI-NEXT:    v_readlane_b32 s38, v32, 4
+; SI-NEXT:    v_readlane_b32 s37, v32, 3
+; SI-NEXT:    v_readlane_b32 s36, v32, 2
+; SI-NEXT:    v_readlane_b32 s35, v32, 1
+; SI-NEXT:    v_readlane_b32 s34, v32, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -77366,42 +77366,42 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
 ; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(2)
-; SI-NEXT:    v_writelane_b32 v20, s30, 0
-; SI-NEXT:    v_writelane_b32 v20, s31, 1
-; SI-NEXT:    v_writelane_b32 v20, s34, 2
-; SI-NEXT:    v_writelane_b32 v20, s35, 3
-; SI-NEXT:    v_writelane_b32 v20, s36, 4
-; SI-NEXT:    v_writelane_b32 v20, s37, 5
-; SI-NEXT:    v_writelane_b32 v20, s38, 6
-; SI-NEXT:    v_writelane_b32 v20, s39, 7
-; SI-NEXT:    v_writelane_b32 v20, s48, 8
-; SI-NEXT:    v_writelane_b32 v20, s49, 9
-; SI-NEXT:    v_writelane_b32 v20, s50, 10
-; SI-NEXT:    v_writelane_b32 v20, s51, 11
-; SI-NEXT:    v_writelane_b32 v20, s52, 12
-; SI-NEXT:    v_writelane_b32 v20, s53, 13
-; SI-NEXT:    v_writelane_b32 v20, s54, 14
-; SI-NEXT:    v_writelane_b32 v20, s55, 15
-; SI-NEXT:    v_writelane_b32 v20, s64, 16
-; SI-NEXT:    v_writelane_b32 v20, s65, 17
-; SI-NEXT:    v_writelane_b32 v20, s66, 18
-; SI-NEXT:    v_writelane_b32 v20, s67, 19
-; SI-NEXT:    v_writelane_b32 v20, s68, 20
-; SI-NEXT:    v_writelane_b32 v20, s69, 21
-; SI-NEXT:    v_writelane_b32 v20, s70, 22
-; SI-NEXT:    v_writelane_b32 v20, s71, 23
-; SI-NEXT:    v_writelane_b32 v20, s80, 24
-; SI-NEXT:    v_writelane_b32 v20, s81, 25
-; SI-NEXT:    v_writelane_b32 v20, s82, 26
-; SI-NEXT:    v_writelane_b32 v20, s83, 27
-; SI-NEXT:    v_writelane_b32 v20, s84, 28
-; SI-NEXT:    v_writelane_b32 v20, s85, 29
-; SI-NEXT:    v_writelane_b32 v20, s86, 30
-; SI-NEXT:    v_writelane_b32 v20, s87, 31
-; SI-NEXT:    v_writelane_b32 v20, s96, 32
-; SI-NEXT:    v_writelane_b32 v20, s97, 33
+; SI-NEXT:    v_writelane_b32 v20, s34, 0
+; SI-NEXT:    v_writelane_b32 v20, s35, 1
+; SI-NEXT:    v_writelane_b32 v20, s36, 2
+; SI-NEXT:    v_writelane_b32 v20, s37, 3
+; SI-NEXT:    v_writelane_b32 v20, s38, 4
+; SI-NEXT:    v_writelane_b32 v20, s39, 5
+; SI-NEXT:    v_writelane_b32 v20, s48, 6
+; SI-NEXT:    v_writelane_b32 v20, s49, 7
+; SI-NEXT:    v_writelane_b32 v20, s50, 8
+; SI-NEXT:    v_writelane_b32 v20, s51, 9
+; SI-NEXT:    v_writelane_b32 v20, s52, 10
+; SI-NEXT:    v_writelane_b32 v20, s53, 11
+; SI-NEXT:    v_writelane_b32 v20, s54, 12
+; SI-NEXT:    v_writelane_b32 v20, s55, 13
+; SI-NEXT:    v_writelane_b32 v20, s64, 14
+; SI-NEXT:    v_writelane_b32 v20, s65, 15
+; SI-NEXT:    v_writelane_b32 v20, s66, 16
+; SI-NEXT:    v_writelane_b32 v20, s67, 17
+; SI-NEXT:    v_writelane_b32 v20, s68, 18
+; SI-NEXT:    v_writelane_b32 v20, s69, 19
+; SI-NEXT:    v_writelane_b32 v20, s70, 20
+; SI-NEXT:    v_writelane_b32 v20, s71, 21
+; SI-NEXT:    v_writelane_b32 v20, s80, 22
+; SI-NEXT:    v_writelane_b32 v20, s81, 23
+; SI-NEXT:    v_writelane_b32 v20, s82, 24
+; SI-NEXT:    v_writelane_b32 v20, s83, 25
+; SI-NEXT:    v_writelane_b32 v20, s84, 26
+; SI-NEXT:    v_writelane_b32 v20, s85, 27
+; SI-NEXT:    v_writelane_b32 v20, s86, 28
+; SI-NEXT:    v_writelane_b32 v20, s87, 29
+; SI-NEXT:    v_writelane_b32 v20, s96, 30
+; SI-NEXT:    v_writelane_b32 v20, s97, 31
+; SI-NEXT:    v_writelane_b32 v20, s98, 32
+; SI-NEXT:    v_writelane_b32 v20, s99, 33
 ; SI-NEXT:    v_readfirstlane_b32 s44, v19
-; SI-NEXT:    v_writelane_b32 v20, s98, 34
+; SI-NEXT:    v_writelane_b32 v20, s30, 34
 ; SI-NEXT:    v_readfirstlane_b32 s5, v18
 ; SI-NEXT:    v_readfirstlane_b32 s4, v17
 ; SI-NEXT:    v_readfirstlane_b32 s7, v16
@@ -77421,7 +77421,7 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
 ; SI-NEXT:    v_readfirstlane_b32 s45, v2
 ; SI-NEXT:    s_cmp_lg_u32 s44, 0
 ; SI-NEXT:    v_readfirstlane_b32 s44, v1
-; SI-NEXT:    v_writelane_b32 v20, s99, 35
+; SI-NEXT:    v_writelane_b32 v20, s31, 35
 ; SI-NEXT:    ; implicit-def: $vgpr22 : SGPR spill to VGPR lane
 ; SI-NEXT:    ; implicit-def: $vgpr21 : SGPR spill to VGPR lane
 ; SI-NEXT:    s_cbranch_scc0 .LBB57_4
@@ -78291,6 +78291,7 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
 ; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x7c, v0
 ; SI-NEXT:    v_mov_b32_e32 v1, s4
+; SI-NEXT:    v_readlane_b32 s30, v20, 34
 ; SI-NEXT:    v_readlane_b32 s19, v22, 11
 ; SI-NEXT:    v_readlane_b32 s17, v22, 17
 ; SI-NEXT:    v_readlane_b32 s15, v22, 23
@@ -78298,42 +78299,41 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
 ; SI-NEXT:    v_readlane_b32 s11, v22, 35
 ; SI-NEXT:    v_readlane_b32 s9, v22, 39
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT:    v_readlane_b32 s99, v20, 35
-; SI-NEXT:    v_readlane_b32 s98, v20, 34
-; SI-NEXT:    v_readlane_b32 s97, v20, 33
-; SI-NEXT:    v_readlane_b32 s96, v20, 32
-; SI-NEXT:    v_readlane_b32 s87, v20, 31
-; SI-NEXT:    v_readlane_b32 s86, v20, 30
-; SI-NEXT:    v_readlane_b32 s85, v20, 29
-; SI-NEXT:    v_readlane_b32 s84, v20, 28
-; SI-NEXT:    v_readlane_b32 s83, v20, 27
-; SI-NEXT:    v_readlane_b32 s82, v20, 26
-; SI-NEXT:    v_readlane_b32 s81, v20, 25
-; SI-NEXT:    v_readlane_b32 s80, v20, 24
-; SI-NEXT:    v_readlane_b32 s71, v20, 23
-; SI-NEXT:    v_readlane_b32 s70, v20, 22
-; SI-NEXT:    v_readlane_b32 s69, v20, 21
-; SI-NEXT:    v_readlane_b32 s68, v20, 20
-; SI-NEXT:    v_readlane_b32 s67, v20, 19
-; SI-NEXT:    v_readlane_b32 s66, v20, 18
-; SI-NEXT:    v_readlane_b32 s65, v20, 17
-; SI-NEXT:    v_readlane_b32 s64, v20, 16
-; SI-NEXT:    v_readlane_b32 s55, v20, 15
-; SI-NEXT:    v_readlane_b32 s54, v20, 14
-; SI-NEXT:    v_readlane_b32 s53, v20, 13
-; SI-NEXT:    v_readlane_b32 s52, v20, 12
-; SI-NEXT:    v_readlane_b32 s51, v20, 11
-; SI-NEXT:    v_readlane_b32 s50, v20, 10
-; SI-NEXT:    v_readlane_b32 s49, v20, 9
-; SI-NEXT:    v_readlane_b32 s48, v20, 8
-; SI-NEXT:    v_readlane_b32 s39, v20, 7
-; SI-NEXT:    v_readlane_b32 s38, v20, 6
-; SI-NEXT:    v_readlane_b32 s37, v20, 5
-; SI-NEXT:    v_readlane_b32 s36, v20, 4
-; SI-NEXT:    v_readlane_b32 s35, v20, 3
-; SI-NEXT:    v_readlane_b32 s34, v20, 2
-; SI-NEXT:    v_readlane_b32 s31, v20, 1
-; SI-NEXT:    v_readlane_b32 s30, v20, 0
+; SI-NEXT:    v_readlane_b32 s31, v20, 35
+; SI-NEXT:    v_readlane_b32 s99, v20, 33
+; SI-NEXT:    v_readlane_b32 s98, v20, 32
+; SI-NEXT:    v_readlane_b32 s97, v20, 31
+; SI-NEXT:    v_readlane_b32 s96, v20, 30
+; SI-NEXT:    v_readlane_b32 s87, v20, 29
+; SI-NEXT:    v_readlane_b32 s86, v20, 28
+; SI-NEXT:    v_readlane_b32 s85, v20, 27
+; SI-NEXT:    v_readlane_b32 s84, v20, 26
+; SI-NEXT:    v_readlane_b32 s83, v20, 25
+; SI-NEXT:    v_readlane_b32 s82, v20, 24
+; SI-NEXT:    v_readlane_b32 s81, v20, 23
+; SI-NEXT:    v_readlane_b32 s80, v20, 22
+; SI-NEXT:    v_readlane_b32 s71, v20, 21
+; SI-NEXT:    v_readlane_b32 s70, v20, 20
+; SI-NEXT:    v_readlane_b32 s69, v20, 19
+; SI-NEXT:    v_readlane_b32 s68, v20, 18
+; SI-NEXT:    v_readlane_b32 s67, v20, 17
+; SI-NEXT:    v_readlane_b32 s66, v20, 16
+; SI-NEXT:    v_readlane_b32 s65, v20, 15
+; SI-NEXT:    v_readlane_b32 s64, v20, 14
+; SI-NEXT:    v_readlane_b32 s55, v20, 13
+; SI-NEXT:    v_readlane_b32 s54, v20, 12
+; SI-NEXT:    v_readlane_b32 s53, v20, 11
+; SI-NEXT:    v_readlane_b32 s52, v20, 10
+; SI-NEXT:    v_readlane_b32 s51, v20, 9
+; SI-NEXT:    v_readlane_b32 s50, v20, 8
+; SI-NEXT:    v_readlane_b32 s49, v20, 7
+; SI-NEXT:    v_readlane_b32 s48, v20, 6
+; SI-NEXT:    v_readlane_b32 s39, v20, 5
+; SI-NEXT:    v_readlane_b32 s38, v20, 4
+; SI-NEXT:    v_readlane_b32 s37, v20, 3
+; SI-NEXT:    v_readlane_b32 s36, v20, 2
+; SI-NEXT:    v_readlane_b32 s35, v20, 1
+; SI-NEXT:    v_readlane_b32 s34, v20, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -78530,38 +78530,38 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v32, s30, 0
-; VI-NEXT:    v_writelane_b32 v32, s31, 1
-; VI-NEXT:    v_writelane_b32 v32, s34, 2
-; VI-NEXT:    v_writelane_b32 v32, s35, 3
-; VI-NEXT:    v_writelane_b32 v32, s36, 4
-; VI-NEXT:    v_writelane_b32 v32, s37, 5
-; VI-NEXT:    v_writelane_b32 v32, s38, 6
-; VI-NEXT:    v_writelane_b32 v32, s39, 7
-; VI-NEXT:    v_writelane_b32 v32, s48, 8
-; VI-NEXT:    v_writelane_b32 v32, s49, 9
-; VI-NEXT:    v_writelane_b32 v32, s50, 10
-; VI-NEXT:    v_writelane_b32 v32, s51, 11
-; VI-NEXT:    v_writelane_b32 v32, s52, 12
-; VI-NEXT:    v_writelane_b32 v32, s53, 13
-; VI-NEXT:    v_writelane_b32 v32, s54, 14
-; VI-NEXT:    v_writelane_b32 v32, s55, 15
-; VI-NEXT:    v_writelane_b32 v32, s64, 16
-; VI-NEXT:    v_writelane_b32 v32, s65, 17
-; VI-NEXT:    v_writelane_b32 v32, s66, 18
-; VI-NEXT:    v_writelane_b32 v32, s67, 19
-; VI-NEXT:    v_writelane_b32 v32, s68, 20
-; VI-NEXT:    v_writelane_b32 v32, s69, 21
-; VI-NEXT:    v_writelane_b32 v32, s70, 22
-; VI-NEXT:    v_writelane_b32 v32, s71, 23
-; VI-NEXT:    v_writelane_b32 v32, s80, 24
-; VI-NEXT:    v_writelane_b32 v32, s81, 25
-; VI-NEXT:    v_writelane_b32 v32, s82, 26
-; VI-NEXT:    v_writelane_b32 v32, s83, 27
-; VI-NEXT:    v_writelane_b32 v32, s84, 28
-; VI-NEXT:    v_writelane_b32 v32, s85, 29
+; VI-NEXT:    v_writelane_b32 v32, s34, 0
+; VI-NEXT:    v_writelane_b32 v32, s35, 1
+; VI-NEXT:    v_writelane_b32 v32, s36, 2
+; VI-NEXT:    v_writelane_b32 v32, s37, 3
+; VI-NEXT:    v_writelane_b32 v32, s38, 4
+; VI-NEXT:    v_writelane_b32 v32, s39, 5
+; VI-NEXT:    v_writelane_b32 v32, s48, 6
+; VI-NEXT:    v_writelane_b32 v32, s49, 7
+; VI-NEXT:    v_writelane_b32 v32, s50, 8
+; VI-NEXT:    v_writelane_b32 v32, s51, 9
+; VI-NEXT:    v_writelane_b32 v32, s52, 10
+; VI-NEXT:    v_writelane_b32 v32, s53, 11
+; VI-NEXT:    v_writelane_b32 v32, s54, 12
+; VI-NEXT:    v_writelane_b32 v32, s55, 13
+; VI-NEXT:    v_writelane_b32 v32, s64, 14
+; VI-NEXT:    v_writelane_b32 v32, s65, 15
+; VI-NEXT:    v_writelane_b32 v32, s66, 16
+; VI-NEXT:    v_writelane_b32 v32, s67, 17
+; VI-NEXT:    v_writelane_b32 v32, s68, 18
+; VI-NEXT:    v_writelane_b32 v32, s69, 19
+; VI-NEXT:    v_writelane_b32 v32, s70, 20
+; VI-NEXT:    v_writelane_b32 v32, s71, 21
+; VI-NEXT:    v_writelane_b32 v32, s80, 22
+; VI-NEXT:    v_writelane_b32 v32, s81, 23
+; VI-NEXT:    v_writelane_b32 v32, s82, 24
+; VI-NEXT:    v_writelane_b32 v32, s83, 25
+; VI-NEXT:    v_writelane_b32 v32, s84, 26
+; VI-NEXT:    v_writelane_b32 v32, s85, 27
+; VI-NEXT:    v_writelane_b32 v32, s86, 28
+; VI-NEXT:    v_writelane_b32 v32, s87, 29
 ; VI-NEXT:    v_readfirstlane_b32 s44, v19
-; VI-NEXT:    v_writelane_b32 v32, s86, 30
+; VI-NEXT:    v_writelane_b32 v32, s30, 30
 ; VI-NEXT:    v_readfirstlane_b32 s5, v18
 ; VI-NEXT:    v_readfirstlane_b32 s4, v17
 ; VI-NEXT:    v_readfirstlane_b32 s7, v16
@@ -78581,7 +78581,7 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
 ; VI-NEXT:    v_readfirstlane_b32 s45, v2
 ; VI-NEXT:    s_cmp_lg_u32 s44, 0
 ; VI-NEXT:    v_readfirstlane_b32 s44, v1
-; VI-NEXT:    v_writelane_b32 v32, s87, 31
+; VI-NEXT:    v_writelane_b32 v32, s31, 31
 ; VI-NEXT:    ; implicit-def: $vgpr33 : SGPR spill to VGPR lane
 ; VI-NEXT:    s_cbranch_scc0 .LBB57_4
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
@@ -79240,40 +79240,40 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; VI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x7c, v0
+; VI-NEXT:    v_readlane_b32 s30, v32, 30
 ; VI-NEXT:    v_readlane_b32 s7, v33, 1
 ; VI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; VI-NEXT:    v_readlane_b32 s87, v32, 31
-; VI-NEXT:    v_readlane_b32 s86, v32, 30
-; VI-NEXT:    v_readlane_b32 s85, v32, 29
-; VI-NEXT:    v_readlane_b32 s84, v32, 28
-; VI-NEXT:    v_readlane_b32 s83, v32, 27
-; VI-NEXT:    v_readlane_b32 s82, v32, 26
-; VI-NEXT:    v_readlane_b32 s81, v32, 25
-; VI-NEXT:    v_readlane_b32 s80, v32, 24
-; VI-NEXT:    v_readlane_b32 s71, v32, 23
-; VI-NEXT:    v_readlane_b32 s70, v32, 22
-; VI-NEXT:    v_readlane_b32 s69, v32, 21
-; VI-NEXT:    v_readlane_b32 s68, v32, 20
-; VI-NEXT:    v_readlane_b32 s67, v32, 19
-; VI-NEXT:    v_readlane_b32 s66, v32, 18
-; VI-NEXT:    v_readlane_b32 s65, v32, 17
-; VI-NEXT:    v_readlane_b32 s64, v32, 16
-; VI-NEXT:    v_readlane_b32 s55, v32, 15
-; VI-NEXT:    v_readlane_b32 s54, v32, 14
-; VI-NEXT:    v_readlane_b32 s53, v32, 13
-; VI-NEXT:    v_readlane_b32 s52, v32, 12
-; VI-NEXT:    v_readlane_b32 s51, v32, 11
-; VI-NEXT:    v_readlane_b32 s50, v32, 10
-; VI-NEXT:    v_readlane_b32 s49, v32, 9
-; VI-NEXT:    v_readlane_b32 s48, v32, 8
-; VI-NEXT:    v_readlane_b32 s39, v32, 7
-; VI-NEXT:    v_readlane_b32 s38, v32, 6
-; VI-NEXT:    v_readlane_b32 s37, v32, 5
-; VI-NEXT:    v_readlane_b32 s36, v32, 4
-; VI-NEXT:    v_readlane_b32 s35, v32, 3
-; VI-NEXT:    v_readlane_b32 s34, v32, 2
-; VI-NEXT:    v_readlane_b32 s31, v32, 1
-; VI-NEXT:    v_readlane_b32 s30, v32, 0
+; VI-NEXT:    v_readlane_b32 s31, v32, 31
+; VI-NEXT:    v_readlane_b32 s87, v32, 29
+; VI-NEXT:    v_readlane_b32 s86, v32, 28
+; VI-NEXT:    v_readlane_b32 s85, v32, 27
+; VI-NEXT:    v_readlane_b32 s84, v32, 26
+; VI-NEXT:    v_readlane_b32 s83, v32, 25
+; VI-NEXT:    v_readlane_b32 s82, v32, 24
+; VI-NEXT:    v_readlane_b32 s81, v32, 23
+; VI-NEXT:    v_readlane_b32 s80, v32, 22
+; VI-NEXT:    v_readlane_b32 s71, v32, 21
+; VI-NEXT:    v_readlane_b32 s70, v32, 20
+; VI-NEXT:    v_readlane_b32 s69, v32, 19
+; VI-NEXT:    v_readlane_b32 s68, v32, 18
+; VI-NEXT:    v_readlane_b32 s67, v32, 17
+; VI-NEXT:    v_readlane_b32 s66, v32, 16
+; VI-NEXT:    v_readlane_b32 s65, v32, 15
+; VI-NEXT:    v_readlane_b32 s64, v32, 14
+; VI-NEXT:    v_readlane_b32 s55, v32, 13
+; VI-NEXT:    v_readlane_b32 s54, v32, 12
+; VI-NEXT:    v_readlane_b32 s53, v32, 11
+; VI-NEXT:    v_readlane_b32 s52, v32, 10
+; VI-NEXT:    v_readlane_b32 s51, v32, 9
+; VI-NEXT:    v_readlane_b32 s50, v32, 8
+; VI-NEXT:    v_readlane_b32 s49, v32, 7
+; VI-NEXT:    v_readlane_b32 s48, v32, 6
+; VI-NEXT:    v_readlane_b32 s39, v32, 5
+; VI-NEXT:    v_readlane_b32 s38, v32, 4
+; VI-NEXT:    v_readlane_b32 s37, v32, 3
+; VI-NEXT:    v_readlane_b32 s36, v32, 2
+; VI-NEXT:    v_readlane_b32 s35, v32, 1
+; VI-NEXT:    v_readlane_b32 s34, v32, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -79444,42 +79444,42 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
 ; GFX9-NEXT:    buffer_store_dword v29, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v29, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v29, s31, 1
-; GFX9-NEXT:    v_writelane_b32 v29, s34, 2
-; GFX9-NEXT:    v_writelane_b32 v29, s35, 3
-; GFX9-NEXT:    v_writelane_b32 v29, s36, 4
-; GFX9-NEXT:    v_writelane_b32 v29, s37, 5
-; GFX9-NEXT:    v_writelane_b32 v29, s38, 6
-; GFX9-NEXT:    v_writelane_b32 v29, s39, 7
-; GFX9-NEXT:    v_writelane_b32 v29, s48, 8
-; GFX9-NEXT:    v_writelane_b32 v29, s49, 9
-; GFX9-NEXT:    v_writelane_b32 v29, s50, 10
-; GFX9-NEXT:    v_writelane_b32 v29, s51, 11
-; GFX9-NEXT:    v_writelane_b32 v29, s52, 12
-; GFX9-NEXT:    v_writelane_b32 v29, s53, 13
-; GFX9-NEXT:    v_writelane_b32 v29, s54, 14
-; GFX9-NEXT:    v_writelane_b32 v29, s55, 15
-; GFX9-NEXT:    v_writelane_b32 v29, s64, 16
-; GFX9-NEXT:    v_writelane_b32 v29, s65, 17
-; GFX9-NEXT:    v_writelane_b32 v29, s66, 18
-; GFX9-NEXT:    v_writelane_b32 v29, s67, 19
-; GFX9-NEXT:    v_writelane_b32 v29, s68, 20
-; GFX9-NEXT:    v_writelane_b32 v29, s69, 21
-; GFX9-NEXT:    v_writelane_b32 v29, s70, 22
-; GFX9-NEXT:    v_writelane_b32 v29, s71, 23
-; GFX9-NEXT:    v_writelane_b32 v29, s80, 24
-; GFX9-NEXT:    v_writelane_b32 v29, s81, 25
-; GFX9-NEXT:    v_writelane_b32 v29, s82, 26
-; GFX9-NEXT:    v_writelane_b32 v29, s83, 27
-; GFX9-NEXT:    v_writelane_b32 v29, s84, 28
-; GFX9-NEXT:    v_writelane_b32 v29, s85, 29
-; GFX9-NEXT:    v_writelane_b32 v29, s86, 30
-; GFX9-NEXT:    v_writelane_b32 v29, s87, 31
-; GFX9-NEXT:    v_writelane_b32 v29, s96, 32
-; GFX9-NEXT:    v_writelane_b32 v29, s97, 33
+; GFX9-NEXT:    v_writelane_b32 v29, s34, 0
+; GFX9-NEXT:    v_writelane_b32 v29, s35, 1
+; GFX9-NEXT:    v_writelane_b32 v29, s36, 2
+; GFX9-NEXT:    v_writelane_b32 v29, s37, 3
+; GFX9-NEXT:    v_writelane_b32 v29, s38, 4
+; GFX9-NEXT:    v_writelane_b32 v29, s39, 5
+; GFX9-NEXT:    v_writelane_b32 v29, s48, 6
+; GFX9-NEXT:    v_writelane_b32 v29, s49, 7
+; GFX9-NEXT:    v_writelane_b32 v29, s50, 8
+; GFX9-NEXT:    v_writelane_b32 v29, s51, 9
+; GFX9-NEXT:    v_writelane_b32 v29, s52, 10
+; GFX9-NEXT:    v_writelane_b32 v29, s53, 11
+; GFX9-NEXT:    v_writelane_b32 v29, s54, 12
+; GFX9-NEXT:    v_writelane_b32 v29, s55, 13
+; GFX9-NEXT:    v_writelane_b32 v29, s64, 14
+; GFX9-NEXT:    v_writelane_b32 v29, s65, 15
+; GFX9-NEXT:    v_writelane_b32 v29, s66, 16
+; GFX9-NEXT:    v_writelane_b32 v29, s67, 17
+; GFX9-NEXT:    v_writelane_b32 v29, s68, 18
+; GFX9-NEXT:    v_writelane_b32 v29, s69, 19
+; GFX9-NEXT:    v_writelane_b32 v29, s70, 20
+; GFX9-NEXT:    v_writelane_b32 v29, s71, 21
+; GFX9-NEXT:    v_writelane_b32 v29, s80, 22
+; GFX9-NEXT:    v_writelane_b32 v29, s81, 23
+; GFX9-NEXT:    v_writelane_b32 v29, s82, 24
+; GFX9-NEXT:    v_writelane_b32 v29, s83, 25
+; GFX9-NEXT:    v_writelane_b32 v29, s84, 26
+; GFX9-NEXT:    v_writelane_b32 v29, s85, 27
+; GFX9-NEXT:    v_writelane_b32 v29, s86, 28
+; GFX9-NEXT:    v_writelane_b32 v29, s87, 29
+; GFX9-NEXT:    v_writelane_b32 v29, s96, 30
+; GFX9-NEXT:    v_writelane_b32 v29, s97, 31
+; GFX9-NEXT:    v_writelane_b32 v29, s98, 32
+; GFX9-NEXT:    v_writelane_b32 v29, s99, 33
 ; GFX9-NEXT:    v_readfirstlane_b32 s44, v19
-; GFX9-NEXT:    v_writelane_b32 v29, s98, 34
+; GFX9-NEXT:    v_writelane_b32 v29, s30, 34
 ; GFX9-NEXT:    v_readfirstlane_b32 s5, v18
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v17
 ; GFX9-NEXT:    v_readfirstlane_b32 s7, v16
@@ -79499,7 +79499,7 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
 ; GFX9-NEXT:    v_readfirstlane_b32 s45, v2
 ; GFX9-NEXT:    s_cmp_lg_u32 s44, 0
 ; GFX9-NEXT:    v_readfirstlane_b32 s44, v1
-; GFX9-NEXT:    v_writelane_b32 v29, s99, 35
+; GFX9-NEXT:    v_writelane_b32 v29, s31, 35
 ; GFX9-NEXT:    ; implicit-def: $vgpr30 : SGPR spill to VGPR lane
 ; GFX9-NEXT:    s_cbranch_scc0 .LBB57_4
 ; GFX9-NEXT:  ; %bb.1: ; %cmp.false
@@ -80104,43 +80104,43 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
 ; GFX9-NEXT:    v_perm_b32 v1, s4, v3, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX9-NEXT:    v_readlane_b32 s30, v29, 34
 ; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:124
-; GFX9-NEXT:    v_readlane_b32 s99, v29, 35
-; GFX9-NEXT:    v_readlane_b32 s98, v29, 34
-; GFX9-NEXT:    v_readlane_b32 s97, v29, 33
-; GFX9-NEXT:    v_readlane_b32 s96, v29, 32
-; GFX9-NEXT:    v_readlane_b32 s87, v29, 31
-; GFX9-NEXT:    v_readlane_b32 s86, v29, 30
-; GFX9-NEXT:    v_readlane_b32 s85, v29, 29
-; GFX9-NEXT:    v_readlane_b32 s84, v29, 28
-; GFX9-NEXT:    v_readlane_b32 s83, v29, 27
-; GFX9-NEXT:    v_readlane_b32 s82, v29, 26
-; GFX9-NEXT:    v_readlane_b32 s81, v29, 25
-; GFX9-NEXT:    v_readlane_b32 s80, v29, 24
-; GFX9-NEXT:    v_readlane_b32 s71, v29, 23
-; GFX9-NEXT:    v_readlane_b32 s70, v29, 22
-; GFX9-NEXT:    v_readlane_b32 s69, v29, 21
-; GFX9-NEXT:    v_readlane_b32 s68, v29, 20
-; GFX9-NEXT:    v_readlane_b32 s67, v29, 19
-; GFX9-NEXT:    v_readlane_b32 s66, v29, 18
-; GFX9-NEXT:    v_readlane_b32 s65, v29, 17
-; GFX9-NEXT:    v_readlane_b32 s64, v29, 16
-; GFX9-NEXT:    v_readlane_b32 s55, v29, 15
-; GFX9-NEXT:    v_readlane_b32 s54, v29, 14
-; GFX9-NEXT:    v_readlane_b32 s53, v29, 13
-; GFX9-NEXT:    v_readlane_b32 s52, v29, 12
-; GFX9-NEXT:    v_readlane_b32 s51, v29, 11
-; GFX9-NEXT:    v_readlane_b32 s50, v29, 10
-; GFX9-NEXT:    v_readlane_b32 s49, v29, 9
-; GFX9-NEXT:    v_readlane_b32 s48, v29, 8
-; GFX9-NEXT:    v_readlane_b32 s39, v29, 7
-; GFX9-NEXT:    v_readlane_b32 s38, v29, 6
-; GFX9-NEXT:    v_readlane_b32 s37, v29, 5
-; GFX9-NEXT:    v_readlane_b32 s36, v29, 4
-; GFX9-NEXT:    v_readlane_b32 s35, v29, 3
-; GFX9-NEXT:    v_readlane_b32 s34, v29, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v29, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v29, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v29, 35
+; GFX9-NEXT:    v_readlane_b32 s99, v29, 33
+; GFX9-NEXT:    v_readlane_b32 s98, v29, 32
+; GFX9-NEXT:    v_readlane_b32 s97, v29, 31
+; GFX9-NEXT:    v_readlane_b32 s96, v29, 30
+; GFX9-NEXT:    v_readlane_b32 s87, v29, 29
+; GFX9-NEXT:    v_readlane_b32 s86, v29, 28
+; GFX9-NEXT:    v_readlane_b32 s85, v29, 27
+; GFX9-NEXT:    v_readlane_b32 s84, v29, 26
+; GFX9-NEXT:    v_readlane_b32 s83, v29, 25
+; GFX9-NEXT:    v_readlane_b32 s82, v29, 24
+; GFX9-NEXT:    v_readlane_b32 s81, v29, 23
+; GFX9-NEXT:    v_readlane_b32 s80, v29, 22
+; GFX9-NEXT:    v_readlane_b32 s71, v29, 21
+; GFX9-NEXT:    v_readlane_b32 s70, v29, 20
+; GFX9-NEXT:    v_readlane_b32 s69, v29, 19
+; GFX9-NEXT:    v_readlane_b32 s68, v29, 18
+; GFX9-NEXT:    v_readlane_b32 s67, v29, 17
+; GFX9-NEXT:    v_readlane_b32 s66, v29, 16
+; GFX9-NEXT:    v_readlane_b32 s65, v29, 15
+; GFX9-NEXT:    v_readlane_b32 s64, v29, 14
+; GFX9-NEXT:    v_readlane_b32 s55, v29, 13
+; GFX9-NEXT:    v_readlane_b32 s54, v29, 12
+; GFX9-NEXT:    v_readlane_b32 s53, v29, 11
+; GFX9-NEXT:    v_readlane_b32 s52, v29, 10
+; GFX9-NEXT:    v_readlane_b32 s51, v29, 9
+; GFX9-NEXT:    v_readlane_b32 s50, v29, 8
+; GFX9-NEXT:    v_readlane_b32 s49, v29, 7
+; GFX9-NEXT:    v_readlane_b32 s48, v29, 6
+; GFX9-NEXT:    v_readlane_b32 s39, v29, 5
+; GFX9-NEXT:    v_readlane_b32 s38, v29, 4
+; GFX9-NEXT:    v_readlane_b32 s37, v29, 3
+; GFX9-NEXT:    v_readlane_b32 s36, v29, 2
+; GFX9-NEXT:    v_readlane_b32 s35, v29, 1
+; GFX9-NEXT:    v_readlane_b32 s34, v29, 0
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -80306,66 +80306,66 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
 ; GFX11-NEXT:    scratch_store_b32 off, v26, s32 offset:8
 ; GFX11-NEXT:    scratch_store_b32 off, v27, s32 offset:12
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s4
-; GFX11-NEXT:    v_writelane_b32 v24, s30, 0
-; GFX11-NEXT:    v_writelane_b32 v25, s96, 0
+; GFX11-NEXT:    v_writelane_b32 v24, s34, 0
+; GFX11-NEXT:    v_writelane_b32 v25, s98, 0
 ; GFX11-NEXT:    v_readfirstlane_b32 s42, v15
 ; GFX11-NEXT:    v_readfirstlane_b32 s5, v14
 ; GFX11-NEXT:    v_readfirstlane_b32 s4, v13
-; GFX11-NEXT:    v_writelane_b32 v24, s31, 1
-; GFX11-NEXT:    v_writelane_b32 v25, s97, 1
+; GFX11-NEXT:    v_writelane_b32 v24, s35, 1
+; GFX11-NEXT:    v_writelane_b32 v25, s99, 1
 ; GFX11-NEXT:    v_readfirstlane_b32 s7, v12
 ; GFX11-NEXT:    v_readfirstlane_b32 s6, v11
 ; GFX11-NEXT:    v_readfirstlane_b32 s9, v10
-; GFX11-NEXT:    v_writelane_b32 v24, s34, 2
-; GFX11-NEXT:    v_writelane_b32 v25, s98, 2
+; GFX11-NEXT:    v_writelane_b32 v24, s36, 2
+; GFX11-NEXT:    v_writelane_b32 v25, s100, 2
 ; GFX11-NEXT:    v_readfirstlane_b32 s8, v9
 ; GFX11-NEXT:    v_readfirstlane_b32 s11, v8
 ; GFX11-NEXT:    v_readfirstlane_b32 s10, v7
-; GFX11-NEXT:    v_writelane_b32 v24, s35, 3
-; GFX11-NEXT:    v_writelane_b32 v25, s99, 3
+; GFX11-NEXT:    v_writelane_b32 v24, s37, 3
+; GFX11-NEXT:    v_writelane_b32 v25, s101, 3
 ; GFX11-NEXT:    v_readfirstlane_b32 s13, v6
 ; GFX11-NEXT:    v_readfirstlane_b32 s12, v5
 ; GFX11-NEXT:    v_readfirstlane_b32 s15, v4
-; GFX11-NEXT:    v_writelane_b32 v24, s36, 4
-; GFX11-NEXT:    v_writelane_b32 v25, s100, 4
+; GFX11-NEXT:    v_writelane_b32 v24, s38, 4
+; GFX11-NEXT:    v_writelane_b32 v25, s102, 4
 ; GFX11-NEXT:    v_readfirstlane_b32 s14, v3
 ; GFX11-NEXT:    v_readfirstlane_b32 s41, v2
 ; GFX11-NEXT:    v_readfirstlane_b32 s40, v1
-; GFX11-NEXT:    v_writelane_b32 v24, s37, 5
-; GFX11-NEXT:    v_writelane_b32 v25, s101, 5
+; GFX11-NEXT:    v_writelane_b32 v24, s39, 5
+; GFX11-NEXT:    v_writelane_b32 v25, s103, 5
 ; GFX11-NEXT:    s_cmp_lg_u32 s42, 0
 ; GFX11-NEXT:    s_mov_b32 vcc_lo, 0
 ; GFX11-NEXT:    ; implicit-def: $vgpr27 : SGPR spill to VGPR lane
 ; GFX11-NEXT:    ; implicit-def: $vgpr26 : SGPR spill to VGPR lane
-; GFX11-NEXT:    v_writelane_b32 v24, s38, 6
-; GFX11-NEXT:    v_writelane_b32 v25, s102, 6
-; GFX11-NEXT:    v_writelane_b32 v24, s39, 7
-; GFX11-NEXT:    v_writelane_b32 v25, s103, 7
-; GFX11-NEXT:    v_writelane_b32 v24, s48, 8
-; GFX11-NEXT:    v_writelane_b32 v25, s104, 8
-; GFX11-NEXT:    v_writelane_b32 v24, s49, 9
-; GFX11-NEXT:    v_writelane_b32 v24, s50, 10
-; GFX11-NEXT:    v_writelane_b32 v24, s51, 11
-; GFX11-NEXT:    v_writelane_b32 v24, s52, 12
-; GFX11-NEXT:    v_writelane_b32 v24, s53, 13
-; GFX11-NEXT:    v_writelane_b32 v24, s54, 14
-; GFX11-NEXT:    v_writelane_b32 v24, s55, 15
-; GFX11-NEXT:    v_writelane_b32 v24, s64, 16
-; GFX11-NEXT:    v_writelane_b32 v24, s65, 17
-; GFX11-NEXT:    v_writelane_b32 v24, s66, 18
-; GFX11-NEXT:    v_writelane_b32 v24, s67, 19
-; GFX11-NEXT:    v_writelane_b32 v24, s68, 20
-; GFX11-NEXT:    v_writelane_b32 v24, s69, 21
-; GFX11-NEXT:    v_writelane_b32 v24, s70, 22
-; GFX11-NEXT:    v_writelane_b32 v24, s71, 23
-; GFX11-NEXT:    v_writelane_b32 v24, s80, 24
-; GFX11-NEXT:    v_writelane_b32 v24, s81, 25
-; GFX11-NEXT:    v_writelane_b32 v24, s82, 26
-; GFX11-NEXT:    v_writelane_b32 v24, s83, 27
-; GFX11-NEXT:    v_writelane_b32 v24, s84, 28
-; GFX11-NEXT:    v_writelane_b32 v24, s85, 29
-; GFX11-NEXT:    v_writelane_b32 v24, s86, 30
-; GFX11-NEXT:    v_writelane_b32 v24, s87, 31
+; GFX11-NEXT:    v_writelane_b32 v24, s48, 6
+; GFX11-NEXT:    v_writelane_b32 v25, s104, 6
+; GFX11-NEXT:    v_writelane_b32 v24, s49, 7
+; GFX11-NEXT:    v_writelane_b32 v25, s30, 7
+; GFX11-NEXT:    v_writelane_b32 v24, s50, 8
+; GFX11-NEXT:    v_writelane_b32 v25, s31, 8
+; GFX11-NEXT:    v_writelane_b32 v24, s51, 9
+; GFX11-NEXT:    v_writelane_b32 v24, s52, 10
+; GFX11-NEXT:    v_writelane_b32 v24, s53, 11
+; GFX11-NEXT:    v_writelane_b32 v24, s54, 12
+; GFX11-NEXT:    v_writelane_b32 v24, s55, 13
+; GFX11-NEXT:    v_writelane_b32 v24, s64, 14
+; GFX11-NEXT:    v_writelane_b32 v24, s65, 15
+; GFX11-NEXT:    v_writelane_b32 v24, s66, 16
+; GFX11-NEXT:    v_writelane_b32 v24, s67, 17
+; GFX11-NEXT:    v_writelane_b32 v24, s68, 18
+; GFX11-NEXT:    v_writelane_b32 v24, s69, 19
+; GFX11-NEXT:    v_writelane_b32 v24, s70, 20
+; GFX11-NEXT:    v_writelane_b32 v24, s71, 21
+; GFX11-NEXT:    v_writelane_b32 v24, s80, 22
+; GFX11-NEXT:    v_writelane_b32 v24, s81, 23
+; GFX11-NEXT:    v_writelane_b32 v24, s82, 24
+; GFX11-NEXT:    v_writelane_b32 v24, s83, 25
+; GFX11-NEXT:    v_writelane_b32 v24, s84, 26
+; GFX11-NEXT:    v_writelane_b32 v24, s85, 27
+; GFX11-NEXT:    v_writelane_b32 v24, s86, 28
+; GFX11-NEXT:    v_writelane_b32 v24, s87, 29
+; GFX11-NEXT:    v_writelane_b32 v24, s96, 30
+; GFX11-NEXT:    v_writelane_b32 v24, s97, 31
 ; GFX11-NEXT:    s_cbranch_scc0 .LBB57_4
 ; GFX11-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX11-NEXT:    s_lshr_b32 s42, s5, 24
@@ -80699,11 +80699,11 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
 ; GFX11-NEXT:    s_lshr_b64 s[42:43], s[4:5], 24
 ; GFX11-NEXT:  .LBB57_3: ; %end
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0xc0c0004
-; GFX11-NEXT:    v_readlane_b32 s31, v24, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_perm_b32 v2, s103, s30, v1
-; GFX11-NEXT:    v_readlane_b32 s103, v25, 7
-; GFX11-NEXT:    v_readlane_b32 s30, v24, 0
+; GFX11-NEXT:    v_readlane_b32 s30, v25, 7
+; GFX11-NEXT:    v_readlane_b32 s31, v25, 8
+; GFX11-NEXT:    v_readlane_b32 s103, v25, 5
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX11-NEXT:    v_perm_b32 v19, s83, s81, v1
 ; GFX11-NEXT:    v_perm_b32 v3, s0, s104, v1
@@ -80784,95 +80784,95 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
 ; GFX11-NEXT:    v_readlane_b32 s0, v26, 1
 ; GFX11-NEXT:    v_readlane_b32 s1, v27, 31
 ; GFX11-NEXT:    v_or_b32_e32 v9, v9, v3
-; GFX11-NEXT:    v_readlane_b32 s104, v25, 8
+; GFX11-NEXT:    v_readlane_b32 s104, v25, 6
 ; GFX11-NEXT:    v_or_b32_e32 v3, v10, v11
 ; GFX11-NEXT:    v_perm_b32 v12, s0, s62, v1
 ; GFX11-NEXT:    v_readlane_b32 s0, v26, 2
 ; GFX11-NEXT:    scratch_store_b128 v0, v[6:9], off offset:48
-; GFX11-NEXT:    v_readlane_b32 s102, v25, 6
-; GFX11-NEXT:    v_readlane_b32 s101, v25, 5
+; GFX11-NEXT:    v_readlane_b32 s102, v25, 4
+; GFX11-NEXT:    v_readlane_b32 s101, v25, 3
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v12
 ; GFX11-NEXT:    v_perm_b32 v4, s40, s0, v1
 ; GFX11-NEXT:    v_readlane_b32 s0, v27, 30
-; GFX11-NEXT:    v_readlane_b32 s100, v25, 4
-; GFX11-NEXT:    v_readlane_b32 s99, v25, 3
-; GFX11-NEXT:    v_readlane_b32 s98, v25, 2
+; GFX11-NEXT:    v_readlane_b32 s100, v25, 2
+; GFX11-NEXT:    v_readlane_b32 s99, v25, 1
+; GFX11-NEXT:    v_readlane_b32 s98, v25, 0
 ; GFX11-NEXT:    v_or_b32_e32 v4, v4, v5
 ; GFX11-NEXT:    v_perm_b32 v10, s1, s0, v1
 ; GFX11-NEXT:    v_readlane_b32 s0, v26, 0
 ; GFX11-NEXT:    v_readlane_b32 s1, v27, 26
-; GFX11-NEXT:    v_readlane_b32 s97, v25, 1
-; GFX11-NEXT:    v_readlane_b32 s96, v25, 0
+; GFX11-NEXT:    v_readlane_b32 s97, v24, 31
+; GFX11-NEXT:    v_readlane_b32 s96, v24, 30
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v10
 ; GFX11-NEXT:    v_perm_b32 v5, s41, s0, v1
 ; GFX11-NEXT:    v_readlane_b32 s0, v27, 28
-; GFX11-NEXT:    v_readlane_b32 s87, v24, 31
-; GFX11-NEXT:    v_readlane_b32 s86, v24, 30
-; GFX11-NEXT:    v_readlane_b32 s85, v24, 29
+; GFX11-NEXT:    v_readlane_b32 s87, v24, 29
+; GFX11-NEXT:    v_readlane_b32 s86, v24, 28
+; GFX11-NEXT:    v_readlane_b32 s85, v24, 27
 ; GFX11-NEXT:    v_or_b32_e32 v5, v5, v7
 ; GFX11-NEXT:    v_perm_b32 v11, s0, s60, v1
 ; GFX11-NEXT:    v_readlane_b32 s0, v27, 25
-; GFX11-NEXT:    v_readlane_b32 s84, v24, 28
-; GFX11-NEXT:    v_readlane_b32 s83, v24, 27
+; GFX11-NEXT:    v_readlane_b32 s84, v24, 26
+; GFX11-NEXT:    v_readlane_b32 s83, v24, 25
 ; GFX11-NEXT:    scratch_store_b128 v0, v[2:5], off offset:64
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v11
 ; GFX11-NEXT:    v_perm_b32 v6, s1, s0, v1
 ; GFX11-NEXT:    v_readlane_b32 s0, v27, 29
 ; GFX11-NEXT:    v_readlane_b32 s1, v27, 21
-; GFX11-NEXT:    v_readlane_b32 s82, v24, 26
-; GFX11-NEXT:    v_readlane_b32 s81, v24, 25
+; GFX11-NEXT:    v_readlane_b32 s82, v24, 24
+; GFX11-NEXT:    v_readlane_b32 s81, v24, 23
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v6
 ; GFX11-NEXT:    v_perm_b32 v8, s14, s0, v1
 ; GFX11-NEXT:    v_readlane_b32 s0, v27, 27
-; GFX11-NEXT:    v_readlane_b32 s80, v24, 24
-; GFX11-NEXT:    v_readlane_b32 s71, v24, 23
-; GFX11-NEXT:    v_readlane_b32 s70, v24, 22
+; GFX11-NEXT:    v_readlane_b32 s80, v24, 22
+; GFX11-NEXT:    v_readlane_b32 s71, v24, 21
+; GFX11-NEXT:    v_readlane_b32 s70, v24, 20
 ; GFX11-NEXT:    v_or_b32_e32 v6, v8, v9
 ; GFX11-NEXT:    v_perm_b32 v10, s15, s0, v1
 ; GFX11-NEXT:    v_readlane_b32 s0, v27, 23
-; GFX11-NEXT:    v_readlane_b32 s69, v24, 21
-; GFX11-NEXT:    v_readlane_b32 s68, v24, 20
-; GFX11-NEXT:    v_readlane_b32 s67, v24, 19
+; GFX11-NEXT:    v_readlane_b32 s69, v24, 19
+; GFX11-NEXT:    v_readlane_b32 s68, v24, 18
+; GFX11-NEXT:    v_readlane_b32 s67, v24, 17
 ; GFX11-NEXT:    v_or_b32_e32 v7, v10, v11
 ; GFX11-NEXT:    v_perm_b32 v12, s0, s58, v1
 ; GFX11-NEXT:    v_readlane_b32 s0, v27, 24
-; GFX11-NEXT:    v_readlane_b32 s66, v24, 18
-; GFX11-NEXT:    v_readlane_b32 s65, v24, 17
-; GFX11-NEXT:    v_readlane_b32 s64, v24, 16
+; GFX11-NEXT:    v_readlane_b32 s66, v24, 16
+; GFX11-NEXT:    v_readlane_b32 s65, v24, 15
+; GFX11-NEXT:    v_readlane_b32 s64, v24, 14
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v12
 ; GFX11-NEXT:    v_perm_b32 v8, s12, s0, v1
 ; GFX11-NEXT:    v_readlane_b32 s0, v27, 20
-; GFX11-NEXT:    v_readlane_b32 s55, v24, 15
-; GFX11-NEXT:    v_readlane_b32 s54, v24, 14
-; GFX11-NEXT:    v_readlane_b32 s53, v24, 13
+; GFX11-NEXT:    v_readlane_b32 s55, v24, 13
+; GFX11-NEXT:    v_readlane_b32 s54, v24, 12
+; GFX11-NEXT:    v_readlane_b32 s53, v24, 11
 ; GFX11-NEXT:    v_or_b32_e32 v8, v8, v9
 ; GFX11-NEXT:    v_perm_b32 v10, s1, s0, v1
 ; GFX11-NEXT:    v_readlane_b32 s0, v27, 22
 ; GFX11-NEXT:    v_readlane_b32 s1, v27, 16
-; GFX11-NEXT:    v_readlane_b32 s52, v24, 12
-; GFX11-NEXT:    v_readlane_b32 s51, v24, 11
+; GFX11-NEXT:    v_readlane_b32 s52, v24, 10
+; GFX11-NEXT:    v_readlane_b32 s51, v24, 9
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v10
 ; GFX11-NEXT:    v_perm_b32 v9, s13, s0, v1
 ; GFX11-NEXT:    v_readlane_b32 s0, v27, 18
-; GFX11-NEXT:    v_readlane_b32 s50, v24, 10
-; GFX11-NEXT:    v_readlane_b32 s49, v24, 9
-; GFX11-NEXT:    v_readlane_b32 s48, v24, 8
+; GFX11-NEXT:    v_readlane_b32 s50, v24, 8
+; GFX11-NEXT:    v_readlane_b32 s49, v24, 7
+; GFX11-NEXT:    v_readlane_b32 s48, v24, 6
 ; GFX11-NEXT:    v_or_b32_e32 v9, v9, v3
 ; GFX11-NEXT:    v_perm_b32 v11, s0, s56, v1
 ; GFX11-NEXT:    v_readlane_b32 s0, v27, 15
-; GFX11-NEXT:    v_readlane_b32 s39, v24, 7
-; GFX11-NEXT:    v_readlane_b32 s38, v24, 6
-; GFX11-NEXT:    v_readlane_b32 s37, v24, 5
+; GFX11-NEXT:    v_readlane_b32 s39, v24, 5
+; GFX11-NEXT:    v_readlane_b32 s38, v24, 4
+; GFX11-NEXT:    v_readlane_b32 s37, v24, 3
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v11
 ; GFX11-NEXT:    v_perm_b32 v2, s1, s0, v1
 ; GFX11-NEXT:    v_readlane_b32 s0, v27, 19
 ; GFX11-NEXT:    v_readlane_b32 s1, v27, 11
-; GFX11-NEXT:    v_readlane_b32 s36, v24, 4
-; GFX11-NEXT:    v_readlane_b32 s35, v24, 3
+; GFX11-NEXT:    v_readlane_b32 s36, v24, 2
+; GFX11-NEXT:    v_readlane_b32 s35, v24, 1
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
 ; GFX11-NEXT:    v_perm_b32 v4, s10, s0, v1
 ; GFX11-NEXT:    v_readlane_b32 s0, v27, 17
-; GFX11-NEXT:    v_readlane_b32 s34, v24, 2
+; GFX11-NEXT:    v_readlane_b32 s34, v24, 0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_or_b32_e32 v2, v4, v5
 ; GFX11-NEXT:    v_perm_b32 v10, s11, s0, v1
@@ -91774,43 +91774,42 @@ define inreg <64 x bfloat> @bitcast_v16i64_to_v64bf16_scalar(<16 x i64> inreg %a
 ; SI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_writelane_b32 v33, s30, 0
-; SI-NEXT:    v_writelane_b32 v33, s31, 1
-; SI-NEXT:    v_writelane_b32 v33, s34, 2
-; SI-NEXT:    v_writelane_b32 v33, s35, 3
-; SI-NEXT:    v_writelane_b32 v33, s36, 4
-; SI-NEXT:    v_writelane_b32 v33, s37, 5
-; SI-NEXT:    v_writelane_b32 v33, s38, 6
-; SI-NEXT:    v_writelane_b32 v33, s39, 7
-; SI-NEXT:    v_writelane_b32 v33, s48, 8
-; SI-NEXT:    v_writelane_b32 v33, s49, 9
-; SI-NEXT:    v_writelane_b32 v33, s50, 10
-; SI-NEXT:    v_writelane_b32 v33, s51, 11
-; SI-NEXT:    v_writelane_b32 v33, s52, 12
-; SI-NEXT:    v_writelane_b32 v33, s53, 13
-; SI-NEXT:    v_writelane_b32 v33, s54, 14
-; SI-NEXT:    v_writelane_b32 v33, s55, 15
-; SI-NEXT:    v_writelane_b32 v33, s64, 16
-; SI-NEXT:    v_writelane_b32 v33, s65, 17
-; SI-NEXT:    v_writelane_b32 v33, s66, 18
-; SI-NEXT:    v_writelane_b32 v33, s67, 19
-; SI-NEXT:    v_writelane_b32 v33, s68, 20
-; SI-NEXT:    v_writelane_b32 v33, s69, 21
-; SI-NEXT:    v_writelane_b32 v33, s70, 22
-; SI-NEXT:    v_writelane_b32 v33, s71, 23
-; SI-NEXT:    v_writelane_b32 v33, s80, 24
-; SI-NEXT:    v_writelane_b32 v33, s81, 25
-; SI-NEXT:    v_writelane_b32 v33, s82, 26
-; SI-NEXT:    v_writelane_b32 v33, s83, 27
-; SI-NEXT:    v_writelane_b32 v33, s84, 28
-; SI-NEXT:    v_writelane_b32 v33, s85, 29
-; SI-NEXT:    v_writelane_b32 v33, s86, 30
-; SI-NEXT:    v_writelane_b32 v33, s87, 31
-; SI-NEXT:    v_writelane_b32 v33, s96, 32
-; SI-NEXT:    v_writelane_b32 v33, s97, 33
-; SI-NEXT:    v_writelane_b32 v33, s98, 34
+; SI-NEXT:    v_writelane_b32 v33, s34, 0
+; SI-NEXT:    v_writelane_b32 v33, s35, 1
+; SI-NEXT:    v_writelane_b32 v33, s36, 2
+; SI-NEXT:    v_writelane_b32 v33, s37, 3
+; SI-NEXT:    v_writelane_b32 v33, s38, 4
+; SI-NEXT:    v_writelane_b32 v33, s39, 5
+; SI-NEXT:    v_writelane_b32 v33, s48, 6
+; SI-NEXT:    v_writelane_b32 v33, s49, 7
+; SI-NEXT:    v_writelane_b32 v33, s50, 8
+; SI-NEXT:    v_writelane_b32 v33, s51, 9
+; SI-NEXT:    v_writelane_b32 v33, s52, 10
+; SI-NEXT:    v_writelane_b32 v33, s53, 11
+; SI-NEXT:    v_writelane_b32 v33, s54, 12
+; SI-NEXT:    v_writelane_b32 v33, s55, 13
+; SI-NEXT:    v_writelane_b32 v33, s64, 14
+; SI-NEXT:    v_writelane_b32 v33, s65, 15
+; SI-NEXT:    v_writelane_b32 v33, s66, 16
+; SI-NEXT:    v_writelane_b32 v33, s67, 17
+; SI-NEXT:    v_writelane_b32 v33, s68, 18
+; SI-NEXT:    v_writelane_b32 v33, s69, 19
+; SI-NEXT:    v_writelane_b32 v33, s70, 20
+; SI-NEXT:    v_writelane_b32 v33, s71, 21
+; SI-NEXT:    v_writelane_b32 v33, s80, 22
+; SI-NEXT:    v_writelane_b32 v33, s81, 23
+; SI-NEXT:    v_writelane_b32 v33, s82, 24
+; SI-NEXT:    v_writelane_b32 v33, s83, 25
+; SI-NEXT:    v_writelane_b32 v33, s84, 26
+; SI-NEXT:    v_writelane_b32 v33, s85, 27
+; SI-NEXT:    v_writelane_b32 v33, s86, 28
+; SI-NEXT:    v_writelane_b32 v33, s87, 29
+; SI-NEXT:    v_writelane_b32 v33, s96, 30
+; SI-NEXT:    v_writelane_b32 v33, s97, 31
+; SI-NEXT:    v_writelane_b32 v33, s98, 32
+; SI-NEXT:    v_writelane_b32 v33, s99, 33
 ; SI-NEXT:    v_readfirstlane_b32 s4, v18
-; SI-NEXT:    v_writelane_b32 v33, s99, 35
+; SI-NEXT:    v_writelane_b32 v33, s30, 34
 ; SI-NEXT:    v_readfirstlane_b32 s70, v17
 ; SI-NEXT:    v_readfirstlane_b32 s71, v16
 ; SI-NEXT:    v_readfirstlane_b32 s80, v15
@@ -91830,6 +91829,7 @@ define inreg <64 x bfloat> @bitcast_v16i64_to_v64bf16_scalar(<16 x i64> inreg %a
 ; SI-NEXT:    v_readfirstlane_b32 s8, v1
 ; SI-NEXT:    s_cmp_lg_u32 s4, 0
 ; SI-NEXT:    v_readfirstlane_b32 s9, v0
+; SI-NEXT:    v_writelane_b32 v33, s31, 35
 ; SI-NEXT:    ; implicit-def: $vgpr34 : SGPR spill to VGPR lane
 ; SI-NEXT:    s_cbranch_scc0 .LBB61_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
@@ -92137,43 +92137,43 @@ define inreg <64 x bfloat> @bitcast_v16i64_to_v64bf16_scalar(<16 x i64> inreg %a
 ; SI-NEXT:    v_readlane_b32 s4, v34, 1
 ; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v31
 ; SI-NEXT:    v_mul_f32_e64 v31, 1.0, s4
+; SI-NEXT:    v_readlane_b32 s30, v33, 34
 ; SI-NEXT:    v_lshr_b64 v[31:32], v[31:32], 16
-; SI-NEXT:    v_readlane_b32 s99, v33, 35
-; SI-NEXT:    v_readlane_b32 s98, v33, 34
-; SI-NEXT:    v_readlane_b32 s97, v33, 33
-; SI-NEXT:    v_readlane_b32 s96, v33, 32
-; SI-NEXT:    v_readlane_b32 s87, v33, 31
-; SI-NEXT:    v_readlane_b32 s86, v33, 30
-; SI-NEXT:    v_readlane_b32 s85, v33, 29
-; SI-NEXT:    v_readlane_b32 s84, v33, 28
-; SI-NEXT:    v_readlane_b32 s83, v33, 27
-; SI-NEXT:    v_readlane_b32 s82, v33, 26
-; SI-NEXT:    v_readlane_b32 s81, v33, 25
-; SI-NEXT:    v_readlane_b32 s80, v33, 24
-; SI-NEXT:    v_readlane_b32 s71, v33, 23
-; SI-NEXT:    v_readlane_b32 s70, v33, 22
-; SI-NEXT:    v_readlane_b32 s69, v33, 21
-; SI-NEXT:    v_readlane_b32 s68, v33, 20
-; SI-NEXT:    v_readlane_b32 s67, v33, 19
-; SI-NEXT:    v_readlane_b32 s66, v33, 18
-; SI-NEXT:    v_readlane_b32 s65, v33, 17
-; SI-NEXT:    v_readlane_b32 s64, v33, 16
-; SI-NEXT:    v_readlane_b32 s55, v33, 15
-; SI-NEXT:    v_readlane_b32 s54, v33, 14
-; SI-NEXT:    v_readlane_b32 s53, v33, 13
-; SI-NEXT:    v_readlane_b32 s52, v33, 12
-; SI-NEXT:    v_readlane_b32 s51, v33, 11
-; SI-NEXT:    v_readlane_b32 s50, v33, 10
-; SI-NEXT:    v_readlane_b32 s49, v33, 9
-; SI-NEXT:    v_readlane_b32 s48, v33, 8
-; SI-NEXT:    v_readlane_b32 s39, v33, 7
-; SI-NEXT:    v_readlane_b32 s38, v33, 6
-; SI-NEXT:    v_readlane_b32 s37, v33, 5
-; SI-NEXT:    v_readlane_b32 s36, v33, 4
-; SI-NEXT:    v_readlane_b32 s35, v33, 3
-; SI-NEXT:    v_readlane_b32 s34, v33, 2
-; SI-NEXT:    v_readlane_b32 s31, v33, 1
-; SI-NEXT:    v_readlane_b32 s30, v33, 0
+; SI-NEXT:    v_readlane_b32 s31, v33, 35
+; SI-NEXT:    v_readlane_b32 s99, v33, 33
+; SI-NEXT:    v_readlane_b32 s98, v33, 32
+; SI-NEXT:    v_readlane_b32 s97, v33, 31
+; SI-NEXT:    v_readlane_b32 s96, v33, 30
+; SI-NEXT:    v_readlane_b32 s87, v33, 29
+; SI-NEXT:    v_readlane_b32 s86, v33, 28
+; SI-NEXT:    v_readlane_b32 s85, v33, 27
+; SI-NEXT:    v_readlane_b32 s84, v33, 26
+; SI-NEXT:    v_readlane_b32 s83, v33, 25
+; SI-NEXT:    v_readlane_b32 s82, v33, 24
+; SI-NEXT:    v_readlane_b32 s81, v33, 23
+; SI-NEXT:    v_readlane_b32 s80, v33, 22
+; SI-NEXT:    v_readlane_b32 s71, v33, 21
+; SI-NEXT:    v_readlane_b32 s70, v33, 20
+; SI-NEXT:    v_readlane_b32 s69, v33, 19
+; SI-NEXT:    v_readlane_b32 s68, v33, 18
+; SI-NEXT:    v_readlane_b32 s67, v33, 17
+; SI-NEXT:    v_readlane_b32 s66, v33, 16
+; SI-NEXT:    v_readlane_b32 s65, v33, 15
+; SI-NEXT:    v_readlane_b32 s64, v33, 14
+; SI-NEXT:    v_readlane_b32 s55, v33, 13
+; SI-NEXT:    v_readlane_b32 s54, v33, 12
+; SI-NEXT:    v_readlane_b32 s53, v33, 11
+; SI-NEXT:    v_readlane_b32 s52, v33, 10
+; SI-NEXT:    v_readlane_b32 s51, v33, 9
+; SI-NEXT:    v_readlane_b32 s50, v33, 8
+; SI-NEXT:    v_readlane_b32 s49, v33, 7
+; SI-NEXT:    v_readlane_b32 s48, v33, 6
+; SI-NEXT:    v_readlane_b32 s39, v33, 5
+; SI-NEXT:    v_readlane_b32 s38, v33, 4
+; SI-NEXT:    v_readlane_b32 s37, v33, 3
+; SI-NEXT:    v_readlane_b32 s36, v33, 2
+; SI-NEXT:    v_readlane_b32 s35, v33, 1
+; SI-NEXT:    v_readlane_b32 s34, v33, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
@@ -95538,15 +95538,17 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v63, s30, 0
-; SI-NEXT:    v_writelane_b32 v63, s31, 1
+; SI-NEXT:    v_writelane_b32 v63, s34, 0
+; SI-NEXT:    v_writelane_b32 v63, s35, 1
+; SI-NEXT:    v_writelane_b32 v63, s36, 2
+; SI-NEXT:    v_writelane_b32 v63, s37, 3
+; SI-NEXT:    v_writelane_b32 v63, s30, 4
 ; SI-NEXT:    v_readfirstlane_b32 vcc_lo, v2
-; SI-NEXT:    v_writelane_b32 v63, s34, 2
+; SI-NEXT:    v_writelane_b32 v63, s31, 5
 ; SI-NEXT:    s_and_b32 s12, s25, 0xffff0000
 ; SI-NEXT:    s_and_b32 s30, vcc_lo, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s31, vcc_lo, 16
 ; SI-NEXT:    v_readfirstlane_b32 vcc_lo, v1
-; SI-NEXT:    v_writelane_b32 v63, s35, 3
 ; SI-NEXT:    s_and_b32 s6, s28, 0xffff0000
 ; SI-NEXT:    s_and_b32 s34, vcc_lo, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s35, vcc_lo, 16
@@ -95624,13 +95626,11 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mul_f32_e64 v0, 1.0, s47
-; SI-NEXT:    v_writelane_b32 v63, s36, 4
 ; SI-NEXT:    s_and_b32 s43, s42, 0xffff0000
 ; SI-NEXT:    v_readfirstlane_b32 s92, v4
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mul_f32_e64 v0, 1.0, s45
-; SI-NEXT:    v_writelane_b32 v63, s37, 5
 ; SI-NEXT:    s_and_b32 s4, s29, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s5, s29, 16
 ; SI-NEXT:    s_lshl_b32 s7, s28, 16
@@ -96344,12 +96344,12 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; SI-NEXT:    v_readlane_b32 s37, v63, 5
-; SI-NEXT:    v_readlane_b32 s36, v63, 4
-; SI-NEXT:    v_readlane_b32 s35, v63, 3
-; SI-NEXT:    v_readlane_b32 s34, v63, 2
-; SI-NEXT:    v_readlane_b32 s31, v63, 1
-; SI-NEXT:    v_readlane_b32 s30, v63, 0
+; SI-NEXT:    v_readlane_b32 s30, v63, 4
+; SI-NEXT:    v_readlane_b32 s31, v63, 5
+; SI-NEXT:    v_readlane_b32 s37, v63, 3
+; SI-NEXT:    v_readlane_b32 s36, v63, 2
+; SI-NEXT:    v_readlane_b32 s35, v63, 1
+; SI-NEXT:    v_readlane_b32 s34, v63, 0
 ; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -99771,28 +99771,28 @@ define inreg <64 x half> @bitcast_v16i64_to_v64f16_scalar(<16 x i64> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v32, s30, 0
-; SI-NEXT:    v_writelane_b32 v32, s31, 1
-; SI-NEXT:    v_writelane_b32 v32, s34, 2
-; SI-NEXT:    v_writelane_b32 v32, s35, 3
-; SI-NEXT:    v_writelane_b32 v32, s36, 4
-; SI-NEXT:    v_writelane_b32 v32, s37, 5
-; SI-NEXT:    v_writelane_b32 v32, s38, 6
-; SI-NEXT:    v_writelane_b32 v32, s39, 7
-; SI-NEXT:    v_writelane_b32 v32, s48, 8
-; SI-NEXT:    v_writelane_b32 v32, s49, 9
-; SI-NEXT:    v_writelane_b32 v32, s50, 10
-; SI-NEXT:    v_writelane_b32 v32, s51, 11
-; SI-NEXT:    v_writelane_b32 v32, s52, 12
-; SI-NEXT:    v_writelane_b32 v32, s53, 13
-; SI-NEXT:    v_writelane_b32 v32, s54, 14
-; SI-NEXT:    v_writelane_b32 v32, s55, 15
-; SI-NEXT:    v_writelane_b32 v32, s64, 16
-; SI-NEXT:    v_writelane_b32 v32, s65, 17
-; SI-NEXT:    v_writelane_b32 v32, s66, 18
-; SI-NEXT:    v_writelane_b32 v32, s67, 19
+; SI-NEXT:    v_writelane_b32 v32, s34, 0
+; SI-NEXT:    v_writelane_b32 v32, s35, 1
+; SI-NEXT:    v_writelane_b32 v32, s36, 2
+; SI-NEXT:    v_writelane_b32 v32, s37, 3
+; SI-NEXT:    v_writelane_b32 v32, s38, 4
+; SI-NEXT:    v_writelane_b32 v32, s39, 5
+; SI-NEXT:    v_writelane_b32 v32, s48, 6
+; SI-NEXT:    v_writelane_b32 v32, s49, 7
+; SI-NEXT:    v_writelane_b32 v32, s50, 8
+; SI-NEXT:    v_writelane_b32 v32, s51, 9
+; SI-NEXT:    v_writelane_b32 v32, s52, 10
+; SI-NEXT:    v_writelane_b32 v32, s53, 11
+; SI-NEXT:    v_writelane_b32 v32, s54, 12
+; SI-NEXT:    v_writelane_b32 v32, s55, 13
+; SI-NEXT:    v_writelane_b32 v32, s64, 14
+; SI-NEXT:    v_writelane_b32 v32, s65, 15
+; SI-NEXT:    v_writelane_b32 v32, s66, 16
+; SI-NEXT:    v_writelane_b32 v32, s67, 17
+; SI-NEXT:    v_writelane_b32 v32, s68, 18
+; SI-NEXT:    v_writelane_b32 v32, s69, 19
 ; SI-NEXT:    v_readfirstlane_b32 s44, v18
-; SI-NEXT:    v_writelane_b32 v32, s68, 20
+; SI-NEXT:    v_writelane_b32 v32, s30, 20
 ; SI-NEXT:    v_readfirstlane_b32 s5, v17
 ; SI-NEXT:    v_readfirstlane_b32 s4, v16
 ; SI-NEXT:    v_readfirstlane_b32 s7, v15
@@ -99812,7 +99812,7 @@ define inreg <64 x half> @bitcast_v16i64_to_v64f16_scalar(<16 x i64> inreg %a, i
 ; SI-NEXT:    v_readfirstlane_b32 s45, v1
 ; SI-NEXT:    s_cmp_lg_u32 s44, 0
 ; SI-NEXT:    v_readfirstlane_b32 s44, v0
-; SI-NEXT:    v_writelane_b32 v32, s69, 21
+; SI-NEXT:    v_writelane_b32 v32, s31, 21
 ; SI-NEXT:    s_cbranch_scc0 .LBB65_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s38, s5, 16
@@ -100010,6 +100010,7 @@ define inreg <64 x half> @bitcast_v16i64_to_v64f16_scalar(<16 x i64> inreg %a, i
 ; SI-NEXT:    s_lshl_b32 s46, s38, 16
 ; SI-NEXT:    s_or_b32 s7, s7, s47
 ; SI-NEXT:    s_or_b32 s5, s5, s46
+; SI-NEXT:    v_readlane_b32 s30, v32, 20
 ; SI-NEXT:    v_mov_b32_e32 v0, s16
 ; SI-NEXT:    v_mov_b32_e32 v1, s17
 ; SI-NEXT:    v_mov_b32_e32 v2, s18
@@ -100042,28 +100043,27 @@ define inreg <64 x half> @bitcast_v16i64_to_v64f16_scalar(<16 x i64> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v29, s7
 ; SI-NEXT:    v_mov_b32_e32 v30, s4
 ; SI-NEXT:    v_mov_b32_e32 v31, s5
-; SI-NEXT:    v_readlane_b32 s69, v32, 21
-; SI-NEXT:    v_readlane_b32 s68, v32, 20
-; SI-NEXT:    v_readlane_b32 s67, v32, 19
-; SI-NEXT:    v_readlane_b32 s66, v32, 18
-; SI-NEXT:    v_readlane_b32 s65, v32, 17
-; SI-NEXT:    v_readlane_b32 s64, v32, 16
-; SI-NEXT:    v_readlane_b32 s55, v32, 15
-; SI-NEXT:    v_readlane_b32 s54, v32, 14
-; SI-NEXT:    v_readlane_b32 s53, v32, 13
-; SI-NEXT:    v_readlane_b32 s52, v32, 12
-; SI-NEXT:    v_readlane_b32 s51, v32, 11
-; SI-NEXT:    v_readlane_b32 s50, v32, 10
-; SI-NEXT:    v_readlane_b32 s49, v32, 9
-; SI-NEXT:    v_readlane_b32 s48, v32, 8
-; SI-NEXT:    v_readlane_b32 s39, v32, 7
-; SI-NEXT:    v_readlane_b32 s38, v32, 6
-; SI-NEXT:    v_readlane_b32 s37, v32, 5
-; SI-NEXT:    v_readlane_b32 s36, v32, 4
-; SI-NEXT:    v_readlane_b32 s35, v32, 3
-; SI-NEXT:    v_readlane_b32 s34, v32, 2
-; SI-NEXT:    v_readlane_b32 s31, v32, 1
-; SI-NEXT:    v_readlane_b32 s30, v32, 0
+; SI-NEXT:    v_readlane_b32 s31, v32, 21
+; SI-NEXT:    v_readlane_b32 s69, v32, 19
+; SI-NEXT:    v_readlane_b32 s68, v32, 18
+; SI-NEXT:    v_readlane_b32 s67, v32, 17
+; SI-NEXT:    v_readlane_b32 s66, v32, 16
+; SI-NEXT:    v_readlane_b32 s65, v32, 15
+; SI-NEXT:    v_readlane_b32 s64, v32, 14
+; SI-NEXT:    v_readlane_b32 s55, v32, 13
+; SI-NEXT:    v_readlane_b32 s54, v32, 12
+; SI-NEXT:    v_readlane_b32 s53, v32, 11
+; SI-NEXT:    v_readlane_b32 s52, v32, 10
+; SI-NEXT:    v_readlane_b32 s51, v32, 9
+; SI-NEXT:    v_readlane_b32 s50, v32, 8
+; SI-NEXT:    v_readlane_b32 s49, v32, 7
+; SI-NEXT:    v_readlane_b32 s48, v32, 6
+; SI-NEXT:    v_readlane_b32 s39, v32, 5
+; SI-NEXT:    v_readlane_b32 s38, v32, 4
+; SI-NEXT:    v_readlane_b32 s37, v32, 3
+; SI-NEXT:    v_readlane_b32 s36, v32, 2
+; SI-NEXT:    v_readlane_b32 s35, v32, 1
+; SI-NEXT:    v_readlane_b32 s34, v32, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -101358,45 +101358,46 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_writelane_b32 v34, s30, 0
-; SI-NEXT:    v_writelane_b32 v34, s31, 1
-; SI-NEXT:    v_writelane_b32 v34, s34, 2
-; SI-NEXT:    v_writelane_b32 v34, s35, 3
-; SI-NEXT:    v_writelane_b32 v34, s36, 4
-; SI-NEXT:    v_writelane_b32 v34, s37, 5
-; SI-NEXT:    v_writelane_b32 v34, s38, 6
-; SI-NEXT:    v_writelane_b32 v34, s39, 7
-; SI-NEXT:    v_writelane_b32 v34, s48, 8
-; SI-NEXT:    v_writelane_b32 v34, s49, 9
-; SI-NEXT:    v_writelane_b32 v34, s50, 10
-; SI-NEXT:    v_writelane_b32 v34, s51, 11
-; SI-NEXT:    v_writelane_b32 v34, s52, 12
-; SI-NEXT:    v_writelane_b32 v34, s53, 13
-; SI-NEXT:    v_writelane_b32 v34, s54, 14
-; SI-NEXT:    v_writelane_b32 v34, s55, 15
-; SI-NEXT:    v_writelane_b32 v34, s64, 16
-; SI-NEXT:    v_writelane_b32 v34, s65, 17
-; SI-NEXT:    v_writelane_b32 v34, s66, 18
-; SI-NEXT:    v_writelane_b32 v34, s67, 19
-; SI-NEXT:    v_writelane_b32 v34, s68, 20
-; SI-NEXT:    v_writelane_b32 v34, s69, 21
-; SI-NEXT:    v_writelane_b32 v34, s70, 22
-; SI-NEXT:    v_writelane_b32 v34, s71, 23
-; SI-NEXT:    v_writelane_b32 v34, s80, 24
-; SI-NEXT:    v_writelane_b32 v34, s81, 25
-; SI-NEXT:    v_writelane_b32 v34, s82, 26
-; SI-NEXT:    v_writelane_b32 v34, s83, 27
-; SI-NEXT:    v_writelane_b32 v34, s84, 28
-; SI-NEXT:    v_writelane_b32 v34, s85, 29
-; SI-NEXT:    v_writelane_b32 v34, s86, 30
-; SI-NEXT:    v_writelane_b32 v34, s87, 31
-; SI-NEXT:    v_writelane_b32 v34, s96, 32
+; SI-NEXT:    v_writelane_b32 v34, s34, 0
+; SI-NEXT:    v_writelane_b32 v34, s35, 1
+; SI-NEXT:    v_writelane_b32 v34, s36, 2
+; SI-NEXT:    v_writelane_b32 v34, s37, 3
+; SI-NEXT:    v_writelane_b32 v34, s38, 4
+; SI-NEXT:    v_writelane_b32 v34, s39, 5
+; SI-NEXT:    v_writelane_b32 v34, s48, 6
+; SI-NEXT:    v_writelane_b32 v34, s49, 7
+; SI-NEXT:    v_writelane_b32 v34, s50, 8
+; SI-NEXT:    v_writelane_b32 v34, s51, 9
+; SI-NEXT:    v_writelane_b32 v34, s52, 10
+; SI-NEXT:    v_writelane_b32 v34, s53, 11
+; SI-NEXT:    v_writelane_b32 v34, s54, 12
+; SI-NEXT:    v_writelane_b32 v34, s55, 13
+; SI-NEXT:    v_writelane_b32 v34, s64, 14
+; SI-NEXT:    v_writelane_b32 v34, s65, 15
+; SI-NEXT:    v_writelane_b32 v34, s66, 16
+; SI-NEXT:    v_writelane_b32 v34, s67, 17
+; SI-NEXT:    v_writelane_b32 v34, s68, 18
+; SI-NEXT:    v_writelane_b32 v34, s69, 19
+; SI-NEXT:    v_writelane_b32 v34, s70, 20
+; SI-NEXT:    v_writelane_b32 v34, s71, 21
+; SI-NEXT:    v_writelane_b32 v34, s80, 22
+; SI-NEXT:    v_writelane_b32 v34, s81, 23
+; SI-NEXT:    v_writelane_b32 v34, s82, 24
+; SI-NEXT:    v_writelane_b32 v34, s83, 25
+; SI-NEXT:    v_writelane_b32 v34, s84, 26
+; SI-NEXT:    v_writelane_b32 v34, s85, 27
+; SI-NEXT:    v_writelane_b32 v34, s86, 28
+; SI-NEXT:    v_writelane_b32 v34, s87, 29
+; SI-NEXT:    v_writelane_b32 v34, s96, 30
+; SI-NEXT:    v_writelane_b32 v34, s97, 31
+; SI-NEXT:    v_writelane_b32 v34, s98, 32
+; SI-NEXT:    v_writelane_b32 v34, s99, 33
 ; SI-NEXT:    v_readfirstlane_b32 s6, v17
-; SI-NEXT:    v_writelane_b32 v34, s97, 33
+; SI-NEXT:    v_writelane_b32 v34, s30, 34
 ; SI-NEXT:    s_lshr_b32 vcc_lo, s6, 16
 ; SI-NEXT:    v_readfirstlane_b32 s8, v16
 ; SI-NEXT:    ; implicit-def: $vgpr35 : SGPR spill to VGPR lane
-; SI-NEXT:    v_writelane_b32 v34, s98, 34
+; SI-NEXT:    v_writelane_b32 v34, s31, 35
 ; SI-NEXT:    s_lshr_b32 vcc_hi, s8, 16
 ; SI-NEXT:    v_readfirstlane_b32 s10, v15
 ; SI-NEXT:    v_readfirstlane_b32 s12, v14
@@ -101416,7 +101417,6 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i
 ; SI-NEXT:    v_readfirstlane_b32 s83, v0
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_writelane_b32 v35, vcc_lo, 0
-; SI-NEXT:    v_writelane_b32 v34, s99, 35
 ; SI-NEXT:    s_lshr_b32 s69, s29, 16
 ; SI-NEXT:    s_lshr_b32 s71, s28, 16
 ; SI-NEXT:    s_lshr_b32 s82, s27, 16
@@ -101850,42 +101850,42 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v30, s66
 ; SI-NEXT:    v_mov_b32_e32 v31, s67
 ; SI-NEXT:  .LBB67_5: ; %end
-; SI-NEXT:    v_readlane_b32 s99, v34, 35
-; SI-NEXT:    v_readlane_b32 s98, v34, 34
-; SI-NEXT:    v_readlane_b32 s97, v34, 33
-; SI-NEXT:    v_readlane_b32 s96, v34, 32
-; SI-NEXT:    v_readlane_b32 s87, v34, 31
-; SI-NEXT:    v_readlane_b32 s86, v34, 30
-; SI-NEXT:    v_readlane_b32 s85, v34, 29
-; SI-NEXT:    v_readlane_b32 s84, v34, 28
-; SI-NEXT:    v_readlane_b32 s83, v34, 27
-; SI-NEXT:    v_readlane_b32 s82, v34, 26
-; SI-NEXT:    v_readlane_b32 s81, v34, 25
-; SI-NEXT:    v_readlane_b32 s80, v34, 24
-; SI-NEXT:    v_readlane_b32 s71, v34, 23
-; SI-NEXT:    v_readlane_b32 s70, v34, 22
-; SI-NEXT:    v_readlane_b32 s69, v34, 21
-; SI-NEXT:    v_readlane_b32 s68, v34, 20
-; SI-NEXT:    v_readlane_b32 s67, v34, 19
-; SI-NEXT:    v_readlane_b32 s66, v34, 18
-; SI-NEXT:    v_readlane_b32 s65, v34, 17
-; SI-NEXT:    v_readlane_b32 s64, v34, 16
-; SI-NEXT:    v_readlane_b32 s55, v34, 15
-; SI-NEXT:    v_readlane_b32 s54, v34, 14
-; SI-NEXT:    v_readlane_b32 s53, v34, 13
-; SI-NEXT:    v_readlane_b32 s52, v34, 12
-; SI-NEXT:    v_readlane_b32 s51, v34, 11
-; SI-NEXT:    v_readlane_b32 s50, v34, 10
-; SI-NEXT:    v_readlane_b32 s49, v34, 9
-; SI-NEXT:    v_readlane_b32 s48, v34, 8
-; SI-NEXT:    v_readlane_b32 s39, v34, 7
-; SI-NEXT:    v_readlane_b32 s38, v34, 6
-; SI-NEXT:    v_readlane_b32 s37, v34, 5
-; SI-NEXT:    v_readlane_b32 s36, v34, 4
-; SI-NEXT:    v_readlane_b32 s35, v34, 3
-; SI-NEXT:    v_readlane_b32 s34, v34, 2
-; SI-NEXT:    v_readlane_b32 s31, v34, 1
-; SI-NEXT:    v_readlane_b32 s30, v34, 0
+; SI-NEXT:    v_readlane_b32 s30, v34, 34
+; SI-NEXT:    v_readlane_b32 s31, v34, 35
+; SI-NEXT:    v_readlane_b32 s99, v34, 33
+; SI-NEXT:    v_readlane_b32 s98, v34, 32
+; SI-NEXT:    v_readlane_b32 s97, v34, 31
+; SI-NEXT:    v_readlane_b32 s96, v34, 30
+; SI-NEXT:    v_readlane_b32 s87, v34, 29
+; SI-NEXT:    v_readlane_b32 s86, v34, 28
+; SI-NEXT:    v_readlane_b32 s85, v34, 27
+; SI-NEXT:    v_readlane_b32 s84, v34, 26
+; SI-NEXT:    v_readlane_b32 s83, v34, 25
+; SI-NEXT:    v_readlane_b32 s82, v34, 24
+; SI-NEXT:    v_readlane_b32 s81, v34, 23
+; SI-NEXT:    v_readlane_b32 s80, v34, 22
+; SI-NEXT:    v_readlane_b32 s71, v34, 21
+; SI-NEXT:    v_readlane_b32 s70, v34, 20
+; SI-NEXT:    v_readlane_b32 s69, v34, 19
+; SI-NEXT:    v_readlane_b32 s68, v34, 18
+; SI-NEXT:    v_readlane_b32 s67, v34, 17
+; SI-NEXT:    v_readlane_b32 s66, v34, 16
+; SI-NEXT:    v_readlane_b32 s65, v34, 15
+; SI-NEXT:    v_readlane_b32 s64, v34, 14
+; SI-NEXT:    v_readlane_b32 s55, v34, 13
+; SI-NEXT:    v_readlane_b32 s54, v34, 12
+; SI-NEXT:    v_readlane_b32 s53, v34, 11
+; SI-NEXT:    v_readlane_b32 s52, v34, 10
+; SI-NEXT:    v_readlane_b32 s51, v34, 9
+; SI-NEXT:    v_readlane_b32 s50, v34, 8
+; SI-NEXT:    v_readlane_b32 s49, v34, 7
+; SI-NEXT:    v_readlane_b32 s48, v34, 6
+; SI-NEXT:    v_readlane_b32 s39, v34, 5
+; SI-NEXT:    v_readlane_b32 s38, v34, 4
+; SI-NEXT:    v_readlane_b32 s37, v34, 3
+; SI-NEXT:    v_readlane_b32 s36, v34, 2
+; SI-NEXT:    v_readlane_b32 s35, v34, 1
+; SI-NEXT:    v_readlane_b32 s34, v34, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -102954,28 +102954,28 @@ define inreg <64 x i16> @bitcast_v16i64_to_v64i16_scalar(<16 x i64> inreg %a, i3
 ; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v32, s30, 0
-; SI-NEXT:    v_writelane_b32 v32, s31, 1
-; SI-NEXT:    v_writelane_b32 v32, s34, 2
-; SI-NEXT:    v_writelane_b32 v32, s35, 3
-; SI-NEXT:    v_writelane_b32 v32, s36, 4
-; SI-NEXT:    v_writelane_b32 v32, s37, 5
-; SI-NEXT:    v_writelane_b32 v32, s38, 6
-; SI-NEXT:    v_writelane_b32 v32, s39, 7
-; SI-NEXT:    v_writelane_b32 v32, s48, 8
-; SI-NEXT:    v_writelane_b32 v32, s49, 9
-; SI-NEXT:    v_writelane_b32 v32, s50, 10
-; SI-NEXT:    v_writelane_b32 v32, s51, 11
-; SI-NEXT:    v_writelane_b32 v32, s52, 12
-; SI-NEXT:    v_writelane_b32 v32, s53, 13
-; SI-NEXT:    v_writelane_b32 v32, s54, 14
-; SI-NEXT:    v_writelane_b32 v32, s55, 15
-; SI-NEXT:    v_writelane_b32 v32, s64, 16
-; SI-NEXT:    v_writelane_b32 v32, s65, 17
-; SI-NEXT:    v_writelane_b32 v32, s66, 18
-; SI-NEXT:    v_writelane_b32 v32, s67, 19
+; SI-NEXT:    v_writelane_b32 v32, s34, 0
+; SI-NEXT:    v_writelane_b32 v32, s35, 1
+; SI-NEXT:    v_writelane_b32 v32, s36, 2
+; SI-NEXT:    v_writelane_b32 v32, s37, 3
+; SI-NEXT:    v_writelane_b32 v32, s38, 4
+; SI-NEXT:    v_writelane_b32 v32, s39, 5
+; SI-NEXT:    v_writelane_b32 v32, s48, 6
+; SI-NEXT:    v_writelane_b32 v32, s49, 7
+; SI-NEXT:    v_writelane_b32 v32, s50, 8
+; SI-NEXT:    v_writelane_b32 v32, s51, 9
+; SI-NEXT:    v_writelane_b32 v32, s52, 10
+; SI-NEXT:    v_writelane_b32 v32, s53, 11
+; SI-NEXT:    v_writelane_b32 v32, s54, 12
+; SI-NEXT:    v_writelane_b32 v32, s55, 13
+; SI-NEXT:    v_writelane_b32 v32, s64, 14
+; SI-NEXT:    v_writelane_b32 v32, s65, 15
+; SI-NEXT:    v_writelane_b32 v32, s66, 16
+; SI-NEXT:    v_writelane_b32 v32, s67, 17
+; SI-NEXT:    v_writelane_b32 v32, s68, 18
+; SI-NEXT:    v_writelane_b32 v32, s69, 19
 ; SI-NEXT:    v_readfirstlane_b32 s44, v18
-; SI-NEXT:    v_writelane_b32 v32, s68, 20
+; SI-NEXT:    v_writelane_b32 v32, s30, 20
 ; SI-NEXT:    v_readfirstlane_b32 s5, v17
 ; SI-NEXT:    v_readfirstlane_b32 s4, v16
 ; SI-NEXT:    v_readfirstlane_b32 s7, v15
@@ -102995,7 +102995,7 @@ define inreg <64 x i16> @bitcast_v16i64_to_v64i16_scalar(<16 x i64> inreg %a, i3
 ; SI-NEXT:    v_readfirstlane_b32 s45, v1
 ; SI-NEXT:    s_cmp_lg_u32 s44, 0
 ; SI-NEXT:    v_readfirstlane_b32 s44, v0
-; SI-NEXT:    v_writelane_b32 v32, s69, 21
+; SI-NEXT:    v_writelane_b32 v32, s31, 21
 ; SI-NEXT:    s_cbranch_scc0 .LBB69_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s38, s5, 16
@@ -103193,6 +103193,7 @@ define inreg <64 x i16> @bitcast_v16i64_to_v64i16_scalar(<16 x i64> inreg %a, i3
 ; SI-NEXT:    s_lshl_b32 s46, s38, 16
 ; SI-NEXT:    s_or_b32 s7, s7, s47
 ; SI-NEXT:    s_or_b32 s5, s5, s46
+; SI-NEXT:    v_readlane_b32 s30, v32, 20
 ; SI-NEXT:    v_mov_b32_e32 v0, s16
 ; SI-NEXT:    v_mov_b32_e32 v1, s17
 ; SI-NEXT:    v_mov_b32_e32 v2, s18
@@ -103225,28 +103226,27 @@ define inreg <64 x i16> @bitcast_v16i64_to_v64i16_scalar(<16 x i64> inreg %a, i3
 ; SI-NEXT:    v_mov_b32_e32 v29, s7
 ; SI-NEXT:    v_mov_b32_e32 v30, s4
 ; SI-NEXT:    v_mov_b32_e32 v31, s5
-; SI-NEXT:    v_readlane_b32 s69, v32, 21
-; SI-NEXT:    v_readlane_b32 s68, v32, 20
-; SI-NEXT:    v_readlane_b32 s67, v32, 19
-; SI-NEXT:    v_readlane_b32 s66, v32, 18
-; SI-NEXT:    v_readlane_b32 s65, v32, 17
-; SI-NEXT:    v_readlane_b32 s64, v32, 16
-; SI-NEXT:    v_readlane_b32 s55, v32, 15
-; SI-NEXT:    v_readlane_b32 s54, v32, 14
-; SI-NEXT:    v_readlane_b32 s53, v32, 13
-; SI-NEXT:    v_readlane_b32 s52, v32, 12
-; SI-NEXT:    v_readlane_b32 s51, v32, 11
-; SI-NEXT:    v_readlane_b32 s50, v32, 10
-; SI-NEXT:    v_readlane_b32 s49, v32, 9
-; SI-NEXT:    v_readlane_b32 s48, v32, 8
-; SI-NEXT:    v_readlane_b32 s39, v32, 7
-; SI-NEXT:    v_readlane_b32 s38, v32, 6
-; SI-NEXT:    v_readlane_b32 s37, v32, 5
-; SI-NEXT:    v_readlane_b32 s36, v32, 4
-; SI-NEXT:    v_readlane_b32 s35, v32, 3
-; SI-NEXT:    v_readlane_b32 s34, v32, 2
-; SI-NEXT:    v_readlane_b32 s31, v32, 1
-; SI-NEXT:    v_readlane_b32 s30, v32, 0
+; SI-NEXT:    v_readlane_b32 s31, v32, 21
+; SI-NEXT:    v_readlane_b32 s69, v32, 19
+; SI-NEXT:    v_readlane_b32 s68, v32, 18
+; SI-NEXT:    v_readlane_b32 s67, v32, 17
+; SI-NEXT:    v_readlane_b32 s66, v32, 16
+; SI-NEXT:    v_readlane_b32 s65, v32, 15
+; SI-NEXT:    v_readlane_b32 s64, v32, 14
+; SI-NEXT:    v_readlane_b32 s55, v32, 13
+; SI-NEXT:    v_readlane_b32 s54, v32, 12
+; SI-NEXT:    v_readlane_b32 s53, v32, 11
+; SI-NEXT:    v_readlane_b32 s52, v32, 10
+; SI-NEXT:    v_readlane_b32 s51, v32, 9
+; SI-NEXT:    v_readlane_b32 s50, v32, 8
+; SI-NEXT:    v_readlane_b32 s49, v32, 7
+; SI-NEXT:    v_readlane_b32 s48, v32, 6
+; SI-NEXT:    v_readlane_b32 s39, v32, 5
+; SI-NEXT:    v_readlane_b32 s38, v32, 4
+; SI-NEXT:    v_readlane_b32 s37, v32, 3
+; SI-NEXT:    v_readlane_b32 s36, v32, 2
+; SI-NEXT:    v_readlane_b32 s35, v32, 1
+; SI-NEXT:    v_readlane_b32 s34, v32, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -104371,45 +104371,46 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_writelane_b32 v32, s30, 0
-; SI-NEXT:    v_writelane_b32 v32, s31, 1
-; SI-NEXT:    v_writelane_b32 v32, s34, 2
-; SI-NEXT:    v_writelane_b32 v32, s35, 3
-; SI-NEXT:    v_writelane_b32 v32, s36, 4
-; SI-NEXT:    v_writelane_b32 v32, s37, 5
-; SI-NEXT:    v_writelane_b32 v32, s38, 6
-; SI-NEXT:    v_writelane_b32 v32, s39, 7
-; SI-NEXT:    v_writelane_b32 v32, s48, 8
-; SI-NEXT:    v_writelane_b32 v32, s49, 9
-; SI-NEXT:    v_writelane_b32 v32, s50, 10
-; SI-NEXT:    v_writelane_b32 v32, s51, 11
-; SI-NEXT:    v_writelane_b32 v32, s52, 12
-; SI-NEXT:    v_writelane_b32 v32, s53, 13
-; SI-NEXT:    v_writelane_b32 v32, s54, 14
-; SI-NEXT:    v_writelane_b32 v32, s55, 15
-; SI-NEXT:    v_writelane_b32 v32, s64, 16
-; SI-NEXT:    v_writelane_b32 v32, s65, 17
-; SI-NEXT:    v_writelane_b32 v32, s66, 18
-; SI-NEXT:    v_writelane_b32 v32, s67, 19
-; SI-NEXT:    v_writelane_b32 v32, s68, 20
-; SI-NEXT:    v_writelane_b32 v32, s69, 21
-; SI-NEXT:    v_writelane_b32 v32, s70, 22
-; SI-NEXT:    v_writelane_b32 v32, s71, 23
-; SI-NEXT:    v_writelane_b32 v32, s80, 24
-; SI-NEXT:    v_writelane_b32 v32, s81, 25
-; SI-NEXT:    v_writelane_b32 v32, s82, 26
-; SI-NEXT:    v_writelane_b32 v32, s83, 27
-; SI-NEXT:    v_writelane_b32 v32, s84, 28
-; SI-NEXT:    v_writelane_b32 v32, s85, 29
-; SI-NEXT:    v_writelane_b32 v32, s86, 30
-; SI-NEXT:    v_writelane_b32 v32, s87, 31
-; SI-NEXT:    v_writelane_b32 v32, s96, 32
+; SI-NEXT:    v_writelane_b32 v32, s34, 0
+; SI-NEXT:    v_writelane_b32 v32, s35, 1
+; SI-NEXT:    v_writelane_b32 v32, s36, 2
+; SI-NEXT:    v_writelane_b32 v32, s37, 3
+; SI-NEXT:    v_writelane_b32 v32, s38, 4
+; SI-NEXT:    v_writelane_b32 v32, s39, 5
+; SI-NEXT:    v_writelane_b32 v32, s48, 6
+; SI-NEXT:    v_writelane_b32 v32, s49, 7
+; SI-NEXT:    v_writelane_b32 v32, s50, 8
+; SI-NEXT:    v_writelane_b32 v32, s51, 9
+; SI-NEXT:    v_writelane_b32 v32, s52, 10
+; SI-NEXT:    v_writelane_b32 v32, s53, 11
+; SI-NEXT:    v_writelane_b32 v32, s54, 12
+; SI-NEXT:    v_writelane_b32 v32, s55, 13
+; SI-NEXT:    v_writelane_b32 v32, s64, 14
+; SI-NEXT:    v_writelane_b32 v32, s65, 15
+; SI-NEXT:    v_writelane_b32 v32, s66, 16
+; SI-NEXT:    v_writelane_b32 v32, s67, 17
+; SI-NEXT:    v_writelane_b32 v32, s68, 18
+; SI-NEXT:    v_writelane_b32 v32, s69, 19
+; SI-NEXT:    v_writelane_b32 v32, s70, 20
+; SI-NEXT:    v_writelane_b32 v32, s71, 21
+; SI-NEXT:    v_writelane_b32 v32, s80, 22
+; SI-NEXT:    v_writelane_b32 v32, s81, 23
+; SI-NEXT:    v_writelane_b32 v32, s82, 24
+; SI-NEXT:    v_writelane_b32 v32, s83, 25
+; SI-NEXT:    v_writelane_b32 v32, s84, 26
+; SI-NEXT:    v_writelane_b32 v32, s85, 27
+; SI-NEXT:    v_writelane_b32 v32, s86, 28
+; SI-NEXT:    v_writelane_b32 v32, s87, 29
+; SI-NEXT:    v_writelane_b32 v32, s96, 30
+; SI-NEXT:    v_writelane_b32 v32, s97, 31
+; SI-NEXT:    v_writelane_b32 v32, s98, 32
+; SI-NEXT:    v_writelane_b32 v32, s99, 33
 ; SI-NEXT:    v_readfirstlane_b32 s9, v16
-; SI-NEXT:    v_writelane_b32 v32, s97, 33
+; SI-NEXT:    v_writelane_b32 v32, s30, 34
 ; SI-NEXT:    s_lshr_b32 s14, s9, 16
 ; SI-NEXT:    v_readfirstlane_b32 s13, v14
 ; SI-NEXT:    ; implicit-def: $vgpr33 : SGPR spill to VGPR lane
-; SI-NEXT:    v_writelane_b32 v32, s98, 34
+; SI-NEXT:    v_writelane_b32 v32, s31, 35
 ; SI-NEXT:    v_readfirstlane_b32 s7, v17
 ; SI-NEXT:    v_readfirstlane_b32 s11, v15
 ; SI-NEXT:    s_lshr_b32 s72, s13, 16
@@ -104429,7 +104430,6 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    v_readfirstlane_b32 s97, v0
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_writelane_b32 v33, s14, 0
-; SI-NEXT:    v_writelane_b32 v32, s99, 35
 ; SI-NEXT:    s_lshr_b32 s92, s29, 16
 ; SI-NEXT:    s_lshr_b32 s95, s28, 16
 ; SI-NEXT:    s_lshr_b32 s34, s27, 16
@@ -104734,6 +104734,7 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_add_i32 s67, s4, 0x30000
 ; SI-NEXT:  .LBB71_3: ; %end
+; SI-NEXT:    v_readlane_b32 s30, v32, 34
 ; SI-NEXT:    v_mov_b32_e32 v0, s36
 ; SI-NEXT:    v_mov_b32_e32 v1, s37
 ; SI-NEXT:    v_mov_b32_e32 v2, s38
@@ -104766,42 +104767,41 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    v_mov_b32_e32 v29, s65
 ; SI-NEXT:    v_mov_b32_e32 v30, s66
 ; SI-NEXT:    v_mov_b32_e32 v31, s67
-; SI-NEXT:    v_readlane_b32 s99, v32, 35
-; SI-NEXT:    v_readlane_b32 s98, v32, 34
-; SI-NEXT:    v_readlane_b32 s97, v32, 33
-; SI-NEXT:    v_readlane_b32 s96, v32, 32
-; SI-NEXT:    v_readlane_b32 s87, v32, 31
-; SI-NEXT:    v_readlane_b32 s86, v32, 30
-; SI-NEXT:    v_readlane_b32 s85, v32, 29
-; SI-NEXT:    v_readlane_b32 s84, v32, 28
-; SI-NEXT:    v_readlane_b32 s83, v32, 27
-; SI-NEXT:    v_readlane_b32 s82, v32, 26
-; SI-NEXT:    v_readlane_b32 s81, v32, 25
-; SI-NEXT:    v_readlane_b32 s80, v32, 24
-; SI-NEXT:    v_readlane_b32 s71, v32, 23
-; SI-NEXT:    v_readlane_b32 s70, v32, 22
-; SI-NEXT:    v_readlane_b32 s69, v32, 21
-; SI-NEXT:    v_readlane_b32 s68, v32, 20
-; SI-NEXT:    v_readlane_b32 s67, v32, 19
-; SI-NEXT:    v_readlane_b32 s66, v32, 18
-; SI-NEXT:    v_readlane_b32 s65, v32, 17
-; SI-NEXT:    v_readlane_b32 s64, v32, 16
-; SI-NEXT:    v_readlane_b32 s55, v32, 15
-; SI-NEXT:    v_readlane_b32 s54, v32, 14
-; SI-NEXT:    v_readlane_b32 s53, v32, 13
-; SI-NEXT:    v_readlane_b32 s52, v32, 12
-; SI-NEXT:    v_readlane_b32 s51, v32, 11
-; SI-NEXT:    v_readlane_b32 s50, v32, 10
-; SI-NEXT:    v_readlane_b32 s49, v32, 9
-; SI-NEXT:    v_readlane_b32 s48, v32, 8
-; SI-NEXT:    v_readlane_b32 s39, v32, 7
-; SI-NEXT:    v_readlane_b32 s38, v32, 6
-; SI-NEXT:    v_readlane_b32 s37, v32, 5
-; SI-NEXT:    v_readlane_b32 s36, v32, 4
-; SI-NEXT:    v_readlane_b32 s35, v32, 3
-; SI-NEXT:    v_readlane_b32 s34, v32, 2
-; SI-NEXT:    v_readlane_b32 s31, v32, 1
-; SI-NEXT:    v_readlane_b32 s30, v32, 0
+; SI-NEXT:    v_readlane_b32 s31, v32, 35
+; SI-NEXT:    v_readlane_b32 s99, v32, 33
+; SI-NEXT:    v_readlane_b32 s98, v32, 32
+; SI-NEXT:    v_readlane_b32 s97, v32, 31
+; SI-NEXT:    v_readlane_b32 s96, v32, 30
+; SI-NEXT:    v_readlane_b32 s87, v32, 29
+; SI-NEXT:    v_readlane_b32 s86, v32, 28
+; SI-NEXT:    v_readlane_b32 s85, v32, 27
+; SI-NEXT:    v_readlane_b32 s84, v32, 26
+; SI-NEXT:    v_readlane_b32 s83, v32, 25
+; SI-NEXT:    v_readlane_b32 s82, v32, 24
+; SI-NEXT:    v_readlane_b32 s81, v32, 23
+; SI-NEXT:    v_readlane_b32 s80, v32, 22
+; SI-NEXT:    v_readlane_b32 s71, v32, 21
+; SI-NEXT:    v_readlane_b32 s70, v32, 20
+; SI-NEXT:    v_readlane_b32 s69, v32, 19
+; SI-NEXT:    v_readlane_b32 s68, v32, 18
+; SI-NEXT:    v_readlane_b32 s67, v32, 17
+; SI-NEXT:    v_readlane_b32 s66, v32, 16
+; SI-NEXT:    v_readlane_b32 s65, v32, 15
+; SI-NEXT:    v_readlane_b32 s64, v32, 14
+; SI-NEXT:    v_readlane_b32 s55, v32, 13
+; SI-NEXT:    v_readlane_b32 s54, v32, 12
+; SI-NEXT:    v_readlane_b32 s53, v32, 11
+; SI-NEXT:    v_readlane_b32 s52, v32, 10
+; SI-NEXT:    v_readlane_b32 s51, v32, 9
+; SI-NEXT:    v_readlane_b32 s50, v32, 8
+; SI-NEXT:    v_readlane_b32 s49, v32, 7
+; SI-NEXT:    v_readlane_b32 s48, v32, 6
+; SI-NEXT:    v_readlane_b32 s39, v32, 5
+; SI-NEXT:    v_readlane_b32 s38, v32, 4
+; SI-NEXT:    v_readlane_b32 s37, v32, 3
+; SI-NEXT:    v_readlane_b32 s36, v32, 2
+; SI-NEXT:    v_readlane_b32 s35, v32, 1
+; SI-NEXT:    v_readlane_b32 s34, v32, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -109212,43 +109212,43 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
 ; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(2)
-; SI-NEXT:    v_writelane_b32 v63, s30, 0
-; SI-NEXT:    v_writelane_b32 v63, s31, 1
-; SI-NEXT:    v_writelane_b32 v63, s34, 2
-; SI-NEXT:    v_writelane_b32 v63, s35, 3
-; SI-NEXT:    v_writelane_b32 v63, s36, 4
-; SI-NEXT:    v_writelane_b32 v63, s37, 5
-; SI-NEXT:    v_writelane_b32 v63, s38, 6
-; SI-NEXT:    v_writelane_b32 v63, s39, 7
-; SI-NEXT:    v_writelane_b32 v63, s48, 8
-; SI-NEXT:    v_writelane_b32 v63, s49, 9
-; SI-NEXT:    v_writelane_b32 v63, s50, 10
-; SI-NEXT:    v_writelane_b32 v63, s51, 11
-; SI-NEXT:    v_writelane_b32 v63, s52, 12
-; SI-NEXT:    v_writelane_b32 v63, s53, 13
-; SI-NEXT:    v_writelane_b32 v63, s54, 14
-; SI-NEXT:    v_writelane_b32 v63, s55, 15
-; SI-NEXT:    v_writelane_b32 v63, s64, 16
-; SI-NEXT:    v_writelane_b32 v63, s65, 17
-; SI-NEXT:    v_writelane_b32 v63, s66, 18
-; SI-NEXT:    v_writelane_b32 v63, s67, 19
-; SI-NEXT:    v_writelane_b32 v63, s68, 20
-; SI-NEXT:    v_writelane_b32 v63, s69, 21
-; SI-NEXT:    v_writelane_b32 v63, s70, 22
-; SI-NEXT:    v_writelane_b32 v63, s71, 23
-; SI-NEXT:    v_writelane_b32 v63, s80, 24
-; SI-NEXT:    v_writelane_b32 v63, s81, 25
-; SI-NEXT:    v_writelane_b32 v63, s82, 26
-; SI-NEXT:    v_writelane_b32 v63, s83, 27
-; SI-NEXT:    v_writelane_b32 v63, s84, 28
-; SI-NEXT:    v_writelane_b32 v63, s85, 29
-; SI-NEXT:    v_writelane_b32 v63, s86, 30
-; SI-NEXT:    v_writelane_b32 v63, s87, 31
-; SI-NEXT:    v_writelane_b32 v63, s96, 32
-; SI-NEXT:    v_writelane_b32 v63, s97, 33
-; SI-NEXT:    v_writelane_b32 v63, s98, 34
+; SI-NEXT:    v_writelane_b32 v63, s34, 0
+; SI-NEXT:    v_writelane_b32 v63, s35, 1
+; SI-NEXT:    v_writelane_b32 v63, s36, 2
+; SI-NEXT:    v_writelane_b32 v63, s37, 3
+; SI-NEXT:    v_writelane_b32 v63, s38, 4
+; SI-NEXT:    v_writelane_b32 v63, s39, 5
+; SI-NEXT:    v_writelane_b32 v63, s48, 6
+; SI-NEXT:    v_writelane_b32 v63, s49, 7
+; SI-NEXT:    v_writelane_b32 v63, s50, 8
+; SI-NEXT:    v_writelane_b32 v63, s51, 9
+; SI-NEXT:    v_writelane_b32 v63, s52, 10
+; SI-NEXT:    v_writelane_b32 v63, s53, 11
+; SI-NEXT:    v_writelane_b32 v63, s54, 12
+; SI-NEXT:    v_writelane_b32 v63, s55, 13
+; SI-NEXT:    v_writelane_b32 v63, s64, 14
+; SI-NEXT:    v_writelane_b32 v63, s65, 15
+; SI-NEXT:    v_writelane_b32 v63, s66, 16
+; SI-NEXT:    v_writelane_b32 v63, s67, 17
+; SI-NEXT:    v_writelane_b32 v63, s68, 18
+; SI-NEXT:    v_writelane_b32 v63, s69, 19
+; SI-NEXT:    v_writelane_b32 v63, s70, 20
+; SI-NEXT:    v_writelane_b32 v63, s71, 21
+; SI-NEXT:    v_writelane_b32 v63, s80, 22
+; SI-NEXT:    v_writelane_b32 v63, s81, 23
+; SI-NEXT:    v_writelane_b32 v63, s82, 24
+; SI-NEXT:    v_writelane_b32 v63, s83, 25
+; SI-NEXT:    v_writelane_b32 v63, s84, 26
+; SI-NEXT:    v_writelane_b32 v63, s85, 27
+; SI-NEXT:    v_writelane_b32 v63, s86, 28
+; SI-NEXT:    v_writelane_b32 v63, s87, 29
+; SI-NEXT:    v_writelane_b32 v63, s96, 30
+; SI-NEXT:    v_writelane_b32 v63, s97, 31
+; SI-NEXT:    v_writelane_b32 v63, s98, 32
+; SI-NEXT:    v_writelane_b32 v63, s99, 33
+; SI-NEXT:    v_writelane_b32 v63, s30, 34
 ; SI-NEXT:    v_readfirstlane_b32 s44, v19
-; SI-NEXT:    v_writelane_b32 v63, s99, 35
+; SI-NEXT:    v_writelane_b32 v63, s31, 35
 ; SI-NEXT:    v_readfirstlane_b32 s5, v18
 ; SI-NEXT:    v_readfirstlane_b32 s4, v17
 ; SI-NEXT:    v_readfirstlane_b32 s7, v16
@@ -110113,38 +110113,38 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
 ; SI-NEXT:    s_or_b32 s4, s4, s6
 ; SI-NEXT:    s_lshl_b32 s6, s37, 8
 ; SI-NEXT:    s_lshl_b32 s8, s35, 24
-; SI-NEXT:    v_readlane_b32 s99, v63, 35
-; SI-NEXT:    v_readlane_b32 s98, v63, 34
-; SI-NEXT:    v_readlane_b32 s97, v63, 33
-; SI-NEXT:    v_readlane_b32 s96, v63, 32
-; SI-NEXT:    v_readlane_b32 s87, v63, 31
-; SI-NEXT:    v_readlane_b32 s86, v63, 30
-; SI-NEXT:    v_readlane_b32 s85, v63, 29
-; SI-NEXT:    v_readlane_b32 s84, v63, 28
-; SI-NEXT:    v_readlane_b32 s83, v63, 27
-; SI-NEXT:    v_readlane_b32 s82, v63, 26
-; SI-NEXT:    v_readlane_b32 s81, v63, 25
-; SI-NEXT:    v_readlane_b32 s80, v63, 24
-; SI-NEXT:    v_readlane_b32 s71, v63, 23
-; SI-NEXT:    v_readlane_b32 s70, v63, 22
-; SI-NEXT:    v_readlane_b32 s69, v63, 21
-; SI-NEXT:    v_readlane_b32 s68, v63, 20
-; SI-NEXT:    v_readlane_b32 s67, v63, 19
-; SI-NEXT:    v_readlane_b32 s66, v63, 18
-; SI-NEXT:    v_readlane_b32 s65, v63, 17
-; SI-NEXT:    v_readlane_b32 s64, v63, 16
-; SI-NEXT:    v_readlane_b32 s55, v63, 15
-; SI-NEXT:    v_readlane_b32 s54, v63, 14
-; SI-NEXT:    v_readlane_b32 s53, v63, 13
-; SI-NEXT:    v_readlane_b32 s52, v63, 12
-; SI-NEXT:    v_readlane_b32 s51, v63, 11
-; SI-NEXT:    v_readlane_b32 s50, v63, 10
-; SI-NEXT:    v_readlane_b32 s49, v63, 9
-; SI-NEXT:    v_readlane_b32 s48, v63, 8
-; SI-NEXT:    v_readlane_b32 s39, v63, 7
-; SI-NEXT:    v_readlane_b32 s38, v63, 6
-; SI-NEXT:    v_readlane_b32 s37, v63, 5
-; SI-NEXT:    v_readlane_b32 s35, v63, 3
+; SI-NEXT:    v_readlane_b32 s99, v63, 33
+; SI-NEXT:    v_readlane_b32 s98, v63, 32
+; SI-NEXT:    v_readlane_b32 s97, v63, 31
+; SI-NEXT:    v_readlane_b32 s96, v63, 30
+; SI-NEXT:    v_readlane_b32 s87, v63, 29
+; SI-NEXT:    v_readlane_b32 s86, v63, 28
+; SI-NEXT:    v_readlane_b32 s85, v63, 27
+; SI-NEXT:    v_readlane_b32 s84, v63, 26
+; SI-NEXT:    v_readlane_b32 s83, v63, 25
+; SI-NEXT:    v_readlane_b32 s82, v63, 24
+; SI-NEXT:    v_readlane_b32 s81, v63, 23
+; SI-NEXT:    v_readlane_b32 s80, v63, 22
+; SI-NEXT:    v_readlane_b32 s71, v63, 21
+; SI-NEXT:    v_readlane_b32 s70, v63, 20
+; SI-NEXT:    v_readlane_b32 s69, v63, 19
+; SI-NEXT:    v_readlane_b32 s68, v63, 18
+; SI-NEXT:    v_readlane_b32 s67, v63, 17
+; SI-NEXT:    v_readlane_b32 s66, v63, 16
+; SI-NEXT:    v_readlane_b32 s65, v63, 15
+; SI-NEXT:    v_readlane_b32 s64, v63, 14
+; SI-NEXT:    v_readlane_b32 s55, v63, 13
+; SI-NEXT:    v_readlane_b32 s54, v63, 12
+; SI-NEXT:    v_readlane_b32 s53, v63, 11
+; SI-NEXT:    v_readlane_b32 s52, v63, 10
+; SI-NEXT:    v_readlane_b32 s51, v63, 9
+; SI-NEXT:    v_readlane_b32 s50, v63, 8
+; SI-NEXT:    v_readlane_b32 s49, v63, 7
+; SI-NEXT:    v_readlane_b32 s48, v63, 6
+; SI-NEXT:    v_readlane_b32 s39, v63, 5
+; SI-NEXT:    v_readlane_b32 s38, v63, 4
+; SI-NEXT:    v_readlane_b32 s37, v63, 3
+; SI-NEXT:    v_readlane_b32 s35, v63, 1
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v10
 ; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
@@ -110177,9 +110177,8 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
 ; SI-NEXT:    s_or_b32 s4, s4, s6
 ; SI-NEXT:    s_lshl_b32 s6, s34, 8
 ; SI-NEXT:    s_lshl_b32 s8, s30, 24
-; SI-NEXT:    v_readlane_b32 s36, v63, 4
-; SI-NEXT:    v_readlane_b32 s34, v63, 2
-; SI-NEXT:    v_readlane_b32 s30, v63, 0
+; SI-NEXT:    v_readlane_b32 s36, v63, 2
+; SI-NEXT:    v_readlane_b32 s34, v63, 0
 ; SI-NEXT:    s_waitcnt vmcnt(1) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v18
 ; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
@@ -110213,7 +110212,8 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
 ; SI-NEXT:    s_or_b32 s4, s4, s6
 ; SI-NEXT:    s_lshl_b32 s6, s95, 8
 ; SI-NEXT:    s_lshl_b32 s8, s93, 24
-; SI-NEXT:    v_readlane_b32 s31, v63, 1
+; SI-NEXT:    v_readlane_b32 s30, v63, 34
+; SI-NEXT:    v_readlane_b32 s31, v63, 35
 ; SI-NEXT:    s_waitcnt vmcnt(1) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v55
 ; SI-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
@@ -110603,39 +110603,39 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
 ; VI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v63, s30, 0
-; VI-NEXT:    v_writelane_b32 v63, s31, 1
-; VI-NEXT:    v_writelane_b32 v63, s34, 2
-; VI-NEXT:    v_writelane_b32 v63, s35, 3
-; VI-NEXT:    v_writelane_b32 v63, s36, 4
-; VI-NEXT:    v_writelane_b32 v63, s37, 5
-; VI-NEXT:    v_writelane_b32 v63, s38, 6
-; VI-NEXT:    v_writelane_b32 v63, s39, 7
-; VI-NEXT:    v_writelane_b32 v63, s48, 8
-; VI-NEXT:    v_writelane_b32 v63, s49, 9
-; VI-NEXT:    v_writelane_b32 v63, s50, 10
-; VI-NEXT:    v_writelane_b32 v63, s51, 11
-; VI-NEXT:    v_writelane_b32 v63, s52, 12
-; VI-NEXT:    v_writelane_b32 v63, s53, 13
-; VI-NEXT:    v_writelane_b32 v63, s54, 14
-; VI-NEXT:    v_writelane_b32 v63, s55, 15
-; VI-NEXT:    v_writelane_b32 v63, s64, 16
-; VI-NEXT:    v_writelane_b32 v63, s65, 17
-; VI-NEXT:    v_writelane_b32 v63, s66, 18
-; VI-NEXT:    v_writelane_b32 v63, s67, 19
-; VI-NEXT:    v_writelane_b32 v63, s68, 20
-; VI-NEXT:    v_writelane_b32 v63, s69, 21
-; VI-NEXT:    v_writelane_b32 v63, s70, 22
-; VI-NEXT:    v_writelane_b32 v63, s71, 23
-; VI-NEXT:    v_writelane_b32 v63, s80, 24
-; VI-NEXT:    v_writelane_b32 v63, s81, 25
-; VI-NEXT:    v_writelane_b32 v63, s82, 26
-; VI-NEXT:    v_writelane_b32 v63, s83, 27
-; VI-NEXT:    v_writelane_b32 v63, s84, 28
-; VI-NEXT:    v_writelane_b32 v63, s85, 29
-; VI-NEXT:    v_writelane_b32 v63, s86, 30
+; VI-NEXT:    v_writelane_b32 v63, s34, 0
+; VI-NEXT:    v_writelane_b32 v63, s35, 1
+; VI-NEXT:    v_writelane_b32 v63, s36, 2
+; VI-NEXT:    v_writelane_b32 v63, s37, 3
+; VI-NEXT:    v_writelane_b32 v63, s38, 4
+; VI-NEXT:    v_writelane_b32 v63, s39, 5
+; VI-NEXT:    v_writelane_b32 v63, s48, 6
+; VI-NEXT:    v_writelane_b32 v63, s49, 7
+; VI-NEXT:    v_writelane_b32 v63, s50, 8
+; VI-NEXT:    v_writelane_b32 v63, s51, 9
+; VI-NEXT:    v_writelane_b32 v63, s52, 10
+; VI-NEXT:    v_writelane_b32 v63, s53, 11
+; VI-NEXT:    v_writelane_b32 v63, s54, 12
+; VI-NEXT:    v_writelane_b32 v63, s55, 13
+; VI-NEXT:    v_writelane_b32 v63, s64, 14
+; VI-NEXT:    v_writelane_b32 v63, s65, 15
+; VI-NEXT:    v_writelane_b32 v63, s66, 16
+; VI-NEXT:    v_writelane_b32 v63, s67, 17
+; VI-NEXT:    v_writelane_b32 v63, s68, 18
+; VI-NEXT:    v_writelane_b32 v63, s69, 19
+; VI-NEXT:    v_writelane_b32 v63, s70, 20
+; VI-NEXT:    v_writelane_b32 v63, s71, 21
+; VI-NEXT:    v_writelane_b32 v63, s80, 22
+; VI-NEXT:    v_writelane_b32 v63, s81, 23
+; VI-NEXT:    v_writelane_b32 v63, s82, 24
+; VI-NEXT:    v_writelane_b32 v63, s83, 25
+; VI-NEXT:    v_writelane_b32 v63, s84, 26
+; VI-NEXT:    v_writelane_b32 v63, s85, 27
+; VI-NEXT:    v_writelane_b32 v63, s86, 28
+; VI-NEXT:    v_writelane_b32 v63, s87, 29
+; VI-NEXT:    v_writelane_b32 v63, s30, 30
 ; VI-NEXT:    v_readfirstlane_b32 s44, v19
-; VI-NEXT:    v_writelane_b32 v63, s87, 31
+; VI-NEXT:    v_writelane_b32 v63, s31, 31
 ; VI-NEXT:    v_readfirstlane_b32 s5, v18
 ; VI-NEXT:    v_readfirstlane_b32 s4, v17
 ; VI-NEXT:    v_readfirstlane_b32 s7, v16
@@ -111467,38 +111467,38 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
 ; VI-NEXT:    v_perm_b32 v29, v29, v44, s4
 ; VI-NEXT:    v_perm_b32 v27, v27, v33, s4
-; VI-NEXT:    v_readlane_b32 s87, v63, 31
-; VI-NEXT:    v_readlane_b32 s86, v63, 30
-; VI-NEXT:    v_readlane_b32 s85, v63, 29
-; VI-NEXT:    v_readlane_b32 s84, v63, 28
-; VI-NEXT:    v_readlane_b32 s83, v63, 27
-; VI-NEXT:    v_readlane_b32 s82, v63, 26
-; VI-NEXT:    v_readlane_b32 s81, v63, 25
-; VI-NEXT:    v_readlane_b32 s80, v63, 24
-; VI-NEXT:    v_readlane_b32 s71, v63, 23
-; VI-NEXT:    v_readlane_b32 s70, v63, 22
-; VI-NEXT:    v_readlane_b32 s69, v63, 21
-; VI-NEXT:    v_readlane_b32 s68, v63, 20
-; VI-NEXT:    v_readlane_b32 s67, v63, 19
-; VI-NEXT:    v_readlane_b32 s66, v63, 18
-; VI-NEXT:    v_readlane_b32 s65, v63, 17
-; VI-NEXT:    v_readlane_b32 s64, v63, 16
-; VI-NEXT:    v_readlane_b32 s55, v63, 15
-; VI-NEXT:    v_readlane_b32 s54, v63, 14
-; VI-NEXT:    v_readlane_b32 s53, v63, 13
-; VI-NEXT:    v_readlane_b32 s52, v63, 12
-; VI-NEXT:    v_readlane_b32 s51, v63, 11
-; VI-NEXT:    v_readlane_b32 s50, v63, 10
-; VI-NEXT:    v_readlane_b32 s49, v63, 9
-; VI-NEXT:    v_readlane_b32 s48, v63, 8
-; VI-NEXT:    v_readlane_b32 s39, v63, 7
-; VI-NEXT:    v_readlane_b32 s38, v63, 6
-; VI-NEXT:    v_readlane_b32 s37, v63, 5
-; VI-NEXT:    v_readlane_b32 s36, v63, 4
-; VI-NEXT:    v_readlane_b32 s35, v63, 3
-; VI-NEXT:    v_readlane_b32 s34, v63, 2
-; VI-NEXT:    v_readlane_b32 s31, v63, 1
-; VI-NEXT:    v_readlane_b32 s30, v63, 0
+; VI-NEXT:    v_readlane_b32 s30, v63, 30
+; VI-NEXT:    v_readlane_b32 s31, v63, 31
+; VI-NEXT:    v_readlane_b32 s87, v63, 29
+; VI-NEXT:    v_readlane_b32 s86, v63, 28
+; VI-NEXT:    v_readlane_b32 s85, v63, 27
+; VI-NEXT:    v_readlane_b32 s84, v63, 26
+; VI-NEXT:    v_readlane_b32 s83, v63, 25
+; VI-NEXT:    v_readlane_b32 s82, v63, 24
+; VI-NEXT:    v_readlane_b32 s81, v63, 23
+; VI-NEXT:    v_readlane_b32 s80, v63, 22
+; VI-NEXT:    v_readlane_b32 s71, v63, 21
+; VI-NEXT:    v_readlane_b32 s70, v63, 20
+; VI-NEXT:    v_readlane_b32 s69, v63, 19
+; VI-NEXT:    v_readlane_b32 s68, v63, 18
+; VI-NEXT:    v_readlane_b32 s67, v63, 17
+; VI-NEXT:    v_readlane_b32 s66, v63, 16
+; VI-NEXT:    v_readlane_b32 s65, v63, 15
+; VI-NEXT:    v_readlane_b32 s64, v63, 14
+; VI-NEXT:    v_readlane_b32 s55, v63, 13
+; VI-NEXT:    v_readlane_b32 s54, v63, 12
+; VI-NEXT:    v_readlane_b32 s53, v63, 11
+; VI-NEXT:    v_readlane_b32 s52, v63, 10
+; VI-NEXT:    v_readlane_b32 s51, v63, 9
+; VI-NEXT:    v_readlane_b32 s50, v63, 8
+; VI-NEXT:    v_readlane_b32 s49, v63, 7
+; VI-NEXT:    v_readlane_b32 s48, v63, 6
+; VI-NEXT:    v_readlane_b32 s39, v63, 5
+; VI-NEXT:    v_readlane_b32 s38, v63, 4
+; VI-NEXT:    v_readlane_b32 s37, v63, 3
+; VI-NEXT:    v_readlane_b32 s36, v63, 2
+; VI-NEXT:    v_readlane_b32 s35, v63, 1
+; VI-NEXT:    v_readlane_b32 s34, v63, 0
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_perm_b32 v31, v43, v31, s4
 ; VI-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
@@ -111831,43 +111831,43 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
 ; GFX9-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v63, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v63, s31, 1
-; GFX9-NEXT:    v_writelane_b32 v63, s34, 2
-; GFX9-NEXT:    v_writelane_b32 v63, s35, 3
-; GFX9-NEXT:    v_writelane_b32 v63, s36, 4
-; GFX9-NEXT:    v_writelane_b32 v63, s37, 5
-; GFX9-NEXT:    v_writelane_b32 v63, s38, 6
-; GFX9-NEXT:    v_writelane_b32 v63, s39, 7
-; GFX9-NEXT:    v_writelane_b32 v63, s48, 8
-; GFX9-NEXT:    v_writelane_b32 v63, s49, 9
-; GFX9-NEXT:    v_writelane_b32 v63, s50, 10
-; GFX9-NEXT:    v_writelane_b32 v63, s51, 11
-; GFX9-NEXT:    v_writelane_b32 v63, s52, 12
-; GFX9-NEXT:    v_writelane_b32 v63, s53, 13
-; GFX9-NEXT:    v_writelane_b32 v63, s54, 14
-; GFX9-NEXT:    v_writelane_b32 v63, s55, 15
-; GFX9-NEXT:    v_writelane_b32 v63, s64, 16
-; GFX9-NEXT:    v_writelane_b32 v63, s65, 17
-; GFX9-NEXT:    v_writelane_b32 v63, s66, 18
-; GFX9-NEXT:    v_writelane_b32 v63, s67, 19
-; GFX9-NEXT:    v_writelane_b32 v63, s68, 20
-; GFX9-NEXT:    v_writelane_b32 v63, s69, 21
-; GFX9-NEXT:    v_writelane_b32 v63, s70, 22
-; GFX9-NEXT:    v_writelane_b32 v63, s71, 23
-; GFX9-NEXT:    v_writelane_b32 v63, s80, 24
-; GFX9-NEXT:    v_writelane_b32 v63, s81, 25
-; GFX9-NEXT:    v_writelane_b32 v63, s82, 26
-; GFX9-NEXT:    v_writelane_b32 v63, s83, 27
-; GFX9-NEXT:    v_writelane_b32 v63, s84, 28
-; GFX9-NEXT:    v_writelane_b32 v63, s85, 29
-; GFX9-NEXT:    v_writelane_b32 v63, s86, 30
-; GFX9-NEXT:    v_writelane_b32 v63, s87, 31
-; GFX9-NEXT:    v_writelane_b32 v63, s96, 32
-; GFX9-NEXT:    v_writelane_b32 v63, s97, 33
-; GFX9-NEXT:    v_writelane_b32 v63, s98, 34
+; GFX9-NEXT:    v_writelane_b32 v63, s34, 0
+; GFX9-NEXT:    v_writelane_b32 v63, s35, 1
+; GFX9-NEXT:    v_writelane_b32 v63, s36, 2
+; GFX9-NEXT:    v_writelane_b32 v63, s37, 3
+; GFX9-NEXT:    v_writelane_b32 v63, s38, 4
+; GFX9-NEXT:    v_writelane_b32 v63, s39, 5
+; GFX9-NEXT:    v_writelane_b32 v63, s48, 6
+; GFX9-NEXT:    v_writelane_b32 v63, s49, 7
+; GFX9-NEXT:    v_writelane_b32 v63, s50, 8
+; GFX9-NEXT:    v_writelane_b32 v63, s51, 9
+; GFX9-NEXT:    v_writelane_b32 v63, s52, 10
+; GFX9-NEXT:    v_writelane_b32 v63, s53, 11
+; GFX9-NEXT:    v_writelane_b32 v63, s54, 12
+; GFX9-NEXT:    v_writelane_b32 v63, s55, 13
+; GFX9-NEXT:    v_writelane_b32 v63, s64, 14
+; GFX9-NEXT:    v_writelane_b32 v63, s65, 15
+; GFX9-NEXT:    v_writelane_b32 v63, s66, 16
+; GFX9-NEXT:    v_writelane_b32 v63, s67, 17
+; GFX9-NEXT:    v_writelane_b32 v63, s68, 18
+; GFX9-NEXT:    v_writelane_b32 v63, s69, 19
+; GFX9-NEXT:    v_writelane_b32 v63, s70, 20
+; GFX9-NEXT:    v_writelane_b32 v63, s71, 21
+; GFX9-NEXT:    v_writelane_b32 v63, s80, 22
+; GFX9-NEXT:    v_writelane_b32 v63, s81, 23
+; GFX9-NEXT:    v_writelane_b32 v63, s82, 24
+; GFX9-NEXT:    v_writelane_b32 v63, s83, 25
+; GFX9-NEXT:    v_writelane_b32 v63, s84, 26
+; GFX9-NEXT:    v_writelane_b32 v63, s85, 27
+; GFX9-NEXT:    v_writelane_b32 v63, s86, 28
+; GFX9-NEXT:    v_writelane_b32 v63, s87, 29
+; GFX9-NEXT:    v_writelane_b32 v63, s96, 30
+; GFX9-NEXT:    v_writelane_b32 v63, s97, 31
+; GFX9-NEXT:    v_writelane_b32 v63, s98, 32
+; GFX9-NEXT:    v_writelane_b32 v63, s99, 33
+; GFX9-NEXT:    v_writelane_b32 v63, s30, 34
 ; GFX9-NEXT:    v_readfirstlane_b32 s44, v19
-; GFX9-NEXT:    v_writelane_b32 v63, s99, 35
+; GFX9-NEXT:    v_writelane_b32 v63, s31, 35
 ; GFX9-NEXT:    v_readfirstlane_b32 s5, v18
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v17
 ; GFX9-NEXT:    v_readfirstlane_b32 s7, v16
@@ -112720,42 +112720,42 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
 ; GFX9-NEXT:    v_or_b32_e32 v26, v39, v26
 ; GFX9-NEXT:    v_perm_b32 v24, v24, v58, s4
 ; GFX9-NEXT:    v_perm_b32 v22, v22, v55, s4
-; GFX9-NEXT:    v_readlane_b32 s99, v63, 35
-; GFX9-NEXT:    v_readlane_b32 s98, v63, 34
-; GFX9-NEXT:    v_readlane_b32 s97, v63, 33
-; GFX9-NEXT:    v_readlane_b32 s96, v63, 32
-; GFX9-NEXT:    v_readlane_b32 s87, v63, 31
-; GFX9-NEXT:    v_readlane_b32 s86, v63, 30
-; GFX9-NEXT:    v_readlane_b32 s85, v63, 29
-; GFX9-NEXT:    v_readlane_b32 s84, v63, 28
-; GFX9-NEXT:    v_readlane_b32 s83, v63, 27
-; GFX9-NEXT:    v_readlane_b32 s82, v63, 26
-; GFX9-NEXT:    v_readlane_b32 s81, v63, 25
-; GFX9-NEXT:    v_readlane_b32 s80, v63, 24
-; GFX9-NEXT:    v_readlane_b32 s71, v63, 23
-; GFX9-NEXT:    v_readlane_b32 s70, v63, 22
-; GFX9-NEXT:    v_readlane_b32 s69, v63, 21
-; GFX9-NEXT:    v_readlane_b32 s68, v63, 20
-; GFX9-NEXT:    v_readlane_b32 s67, v63, 19
-; GFX9-NEXT:    v_readlane_b32 s66, v63, 18
-; GFX9-NEXT:    v_readlane_b32 s65, v63, 17
-; GFX9-NEXT:    v_readlane_b32 s64, v63, 16
-; GFX9-NEXT:    v_readlane_b32 s55, v63, 15
-; GFX9-NEXT:    v_readlane_b32 s54, v63, 14
-; GFX9-NEXT:    v_readlane_b32 s53, v63, 13
-; GFX9-NEXT:    v_readlane_b32 s52, v63, 12
-; GFX9-NEXT:    v_readlane_b32 s51, v63, 11
-; GFX9-NEXT:    v_readlane_b32 s50, v63, 10
-; GFX9-NEXT:    v_readlane_b32 s49, v63, 9
-; GFX9-NEXT:    v_readlane_b32 s48, v63, 8
-; GFX9-NEXT:    v_readlane_b32 s39, v63, 7
-; GFX9-NEXT:    v_readlane_b32 s38, v63, 6
-; GFX9-NEXT:    v_readlane_b32 s37, v63, 5
-; GFX9-NEXT:    v_readlane_b32 s36, v63, 4
-; GFX9-NEXT:    v_readlane_b32 s35, v63, 3
-; GFX9-NEXT:    v_readlane_b32 s34, v63, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v63, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v63, 0
+; GFX9-NEXT:    v_readlane_b32 s30, v63, 34
+; GFX9-NEXT:    v_readlane_b32 s31, v63, 35
+; GFX9-NEXT:    v_readlane_b32 s99, v63, 33
+; GFX9-NEXT:    v_readlane_b32 s98, v63, 32
+; GFX9-NEXT:    v_readlane_b32 s97, v63, 31
+; GFX9-NEXT:    v_readlane_b32 s96, v63, 30
+; GFX9-NEXT:    v_readlane_b32 s87, v63, 29
+; GFX9-NEXT:    v_readlane_b32 s86, v63, 28
+; GFX9-NEXT:    v_readlane_b32 s85, v63, 27
+; GFX9-NEXT:    v_readlane_b32 s84, v63, 26
+; GFX9-NEXT:    v_readlane_b32 s83, v63, 25
+; GFX9-NEXT:    v_readlane_b32 s82, v63, 24
+; GFX9-NEXT:    v_readlane_b32 s81, v63, 23
+; GFX9-NEXT:    v_readlane_b32 s80, v63, 22
+; GFX9-NEXT:    v_readlane_b32 s71, v63, 21
+; GFX9-NEXT:    v_readlane_b32 s70, v63, 20
+; GFX9-NEXT:    v_readlane_b32 s69, v63, 19
+; GFX9-NEXT:    v_readlane_b32 s68, v63, 18
+; GFX9-NEXT:    v_readlane_b32 s67, v63, 17
+; GFX9-NEXT:    v_readlane_b32 s66, v63, 16
+; GFX9-NEXT:    v_readlane_b32 s65, v63, 15
+; GFX9-NEXT:    v_readlane_b32 s64, v63, 14
+; GFX9-NEXT:    v_readlane_b32 s55, v63, 13
+; GFX9-NEXT:    v_readlane_b32 s54, v63, 12
+; GFX9-NEXT:    v_readlane_b32 s53, v63, 11
+; GFX9-NEXT:    v_readlane_b32 s52, v63, 10
+; GFX9-NEXT:    v_readlane_b32 s51, v63, 9
+; GFX9-NEXT:    v_readlane_b32 s50, v63, 8
+; GFX9-NEXT:    v_readlane_b32 s49, v63, 7
+; GFX9-NEXT:    v_readlane_b32 s48, v63, 6
+; GFX9-NEXT:    v_readlane_b32 s39, v63, 5
+; GFX9-NEXT:    v_readlane_b32 s38, v63, 4
+; GFX9-NEXT:    v_readlane_b32 s37, v63, 3
+; GFX9-NEXT:    v_readlane_b32 s36, v63, 2
+; GFX9-NEXT:    v_readlane_b32 s35, v63, 1
+; GFX9-NEXT:    v_readlane_b32 s34, v63, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-NEXT:    v_perm_b32 v34, v34, v15, s4
 ; GFX9-NEXT:    s_waitcnt vmcnt(3)
@@ -113064,33 +113064,33 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
 ; GFX11-NEXT:    scratch_store_b32 off, v76, s32 offset:80
 ; GFX11-NEXT:    scratch_store_b32 off, v77, s32 offset:84
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s4
-; GFX11-NEXT:    v_writelane_b32 v74, s30, 0
-; GFX11-NEXT:    v_writelane_b32 v75, s96, 0
+; GFX11-NEXT:    v_writelane_b32 v74, s34, 0
+; GFX11-NEXT:    v_writelane_b32 v75, s98, 0
 ; GFX11-NEXT:    v_readfirstlane_b32 s42, v15
 ; GFX11-NEXT:    v_readfirstlane_b32 s41, v14
 ; GFX11-NEXT:    v_readfirstlane_b32 s40, v13
-; GFX11-NEXT:    v_writelane_b32 v74, s31, 1
-; GFX11-NEXT:    v_writelane_b32 v75, s97, 1
+; GFX11-NEXT:    v_writelane_b32 v74, s35, 1
+; GFX11-NEXT:    v_writelane_b32 v75, s99, 1
 ; GFX11-NEXT:    v_readfirstlane_b32 s15, v12
 ; GFX11-NEXT:    v_readfirstlane_b32 s14, v11
 ; GFX11-NEXT:    v_readfirstlane_b32 s11, v10
-; GFX11-NEXT:    v_writelane_b32 v74, s34, 2
-; GFX11-NEXT:    v_writelane_b32 v75, s98, 2
+; GFX11-NEXT:    v_writelane_b32 v74, s36, 2
+; GFX11-NEXT:    v_writelane_b32 v75, s100, 2
 ; GFX11-NEXT:    v_readfirstlane_b32 s10, v9
 ; GFX11-NEXT:    v_readfirstlane_b32 s9, v8
 ; GFX11-NEXT:    v_readfirstlane_b32 s8, v7
-; GFX11-NEXT:    v_writelane_b32 v74, s35, 3
-; GFX11-NEXT:    v_writelane_b32 v75, s99, 3
+; GFX11-NEXT:    v_writelane_b32 v74, s37, 3
+; GFX11-NEXT:    v_writelane_b32 v75, s101, 3
 ; GFX11-NEXT:    v_readfirstlane_b32 s7, v6
 ; GFX11-NEXT:    v_readfirstlane_b32 s6, v5
 ; GFX11-NEXT:    v_readfirstlane_b32 s5, v4
-; GFX11-NEXT:    v_writelane_b32 v74, s36, 4
-; GFX11-NEXT:    v_writelane_b32 v75, s100, 4
+; GFX11-NEXT:    v_writelane_b32 v74, s38, 4
+; GFX11-NEXT:    v_writelane_b32 v75, s102, 4
 ; GFX11-NEXT:    v_readfirstlane_b32 s4, v3
 ; GFX11-NEXT:    v_readfirstlane_b32 s13, v2
 ; GFX11-NEXT:    v_readfirstlane_b32 s12, v1
-; GFX11-NEXT:    v_writelane_b32 v74, s37, 5
-; GFX11-NEXT:    v_writelane_b32 v75, s101, 5
+; GFX11-NEXT:    v_writelane_b32 v74, s39, 5
+; GFX11-NEXT:    v_writelane_b32 v75, s103, 5
 ; GFX11-NEXT:    s_cmp_lg_u32 s42, 0
 ; GFX11-NEXT:    s_mov_b32 vcc_lo, 0
 ; GFX11-NEXT:    s_clause 0x11 ; 72-byte Folded Spill
@@ -113112,37 +113112,37 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
 ; GFX11-NEXT:    scratch_store_b32 off, v63, s32 offset:8
 ; GFX11-NEXT:    scratch_store_b32 off, v72, s32 offset:4
 ; GFX11-NEXT:    scratch_store_b32 off, v73, s32
-; GFX11-NEXT:    v_writelane_b32 v74, s38, 6
-; GFX11-NEXT:    v_writelane_b32 v75, s102, 6
+; GFX11-NEXT:    v_writelane_b32 v74, s48, 6
+; GFX11-NEXT:    v_writelane_b32 v75, s104, 6
 ; GFX11-NEXT:    ; implicit-def: $vgpr76 : SGPR spill to VGPR lane
 ; GFX11-NEXT:    ; implicit-def: $vgpr77 : SGPR spill to VGPR lane
-; GFX11-NEXT:    v_writelane_b32 v74, s39, 7
-; GFX11-NEXT:    v_writelane_b32 v75, s103, 7
-; GFX11-NEXT:    v_writelane_b32 v74, s48, 8
-; GFX11-NEXT:    v_writelane_b32 v75, s104, 8
-; GFX11-NEXT:    v_writelane_b32 v74, s49, 9
-; GFX11-NEXT:    v_writelane_b32 v74, s50, 10
-; GFX11-NEXT:    v_writelane_b32 v74, s51, 11
-; GFX11-NEXT:    v_writelane_b32 v74, s52, 12
-; GFX11-NEXT:    v_writelane_b32 v74, s53, 13
-; GFX11-NEXT:    v_writelane_b32 v74, s54, 14
-; GFX11-NEXT:    v_writelane_b32 v74, s55, 15
-; GFX11-NEXT:    v_writelane_b32 v74, s64, 16
-; GFX11-NEXT:    v_writelane_b32 v74, s65, 17
-; GFX11-NEXT:    v_writelane_b32 v74, s66, 18
-; GFX11-NEXT:    v_writelane_b32 v74, s67, 19
-; GFX11-NEXT:    v_writelane_b32 v74, s68, 20
-; GFX11-NEXT:    v_writelane_b32 v74, s69, 21
-; GFX11-NEXT:    v_writelane_b32 v74, s70, 22
-; GFX11-NEXT:    v_writelane_b32 v74, s71, 23
-; GFX11-NEXT:    v_writelane_b32 v74, s80, 24
-; GFX11-NEXT:    v_writelane_b32 v74, s81, 25
-; GFX11-NEXT:    v_writelane_b32 v74, s82, 26
-; GFX11-NEXT:    v_writelane_b32 v74, s83, 27
-; GFX11-NEXT:    v_writelane_b32 v74, s84, 28
-; GFX11-NEXT:    v_writelane_b32 v74, s85, 29
-; GFX11-NEXT:    v_writelane_b32 v74, s86, 30
-; GFX11-NEXT:    v_writelane_b32 v74, s87, 31
+; GFX11-NEXT:    v_writelane_b32 v74, s49, 7
+; GFX11-NEXT:    v_writelane_b32 v75, s30, 7
+; GFX11-NEXT:    v_writelane_b32 v74, s50, 8
+; GFX11-NEXT:    v_writelane_b32 v75, s31, 8
+; GFX11-NEXT:    v_writelane_b32 v74, s51, 9
+; GFX11-NEXT:    v_writelane_b32 v74, s52, 10
+; GFX11-NEXT:    v_writelane_b32 v74, s53, 11
+; GFX11-NEXT:    v_writelane_b32 v74, s54, 12
+; GFX11-NEXT:    v_writelane_b32 v74, s55, 13
+; GFX11-NEXT:    v_writelane_b32 v74, s64, 14
+; GFX11-NEXT:    v_writelane_b32 v74, s65, 15
+; GFX11-NEXT:    v_writelane_b32 v74, s66, 16
+; GFX11-NEXT:    v_writelane_b32 v74, s67, 17
+; GFX11-NEXT:    v_writelane_b32 v74, s68, 18
+; GFX11-NEXT:    v_writelane_b32 v74, s69, 19
+; GFX11-NEXT:    v_writelane_b32 v74, s70, 20
+; GFX11-NEXT:    v_writelane_b32 v74, s71, 21
+; GFX11-NEXT:    v_writelane_b32 v74, s80, 22
+; GFX11-NEXT:    v_writelane_b32 v74, s81, 23
+; GFX11-NEXT:    v_writelane_b32 v74, s82, 24
+; GFX11-NEXT:    v_writelane_b32 v74, s83, 25
+; GFX11-NEXT:    v_writelane_b32 v74, s84, 26
+; GFX11-NEXT:    v_writelane_b32 v74, s85, 27
+; GFX11-NEXT:    v_writelane_b32 v74, s86, 28
+; GFX11-NEXT:    v_writelane_b32 v74, s87, 29
+; GFX11-NEXT:    v_writelane_b32 v74, s96, 30
+; GFX11-NEXT:    v_writelane_b32 v74, s97, 31
 ; GFX11-NEXT:    s_cbranch_scc0 .LBB73_3
 ; GFX11-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX11-NEXT:    s_lshr_b32 s42, s11, 16
@@ -113852,47 +113852,47 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
 ; GFX11-NEXT:    scratch_load_b32 v42, off, s32 offset:60
 ; GFX11-NEXT:    scratch_load_b32 v41, off, s32 offset:64
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s32 offset:68
-; GFX11-NEXT:    v_readlane_b32 s104, v75, 8
-; GFX11-NEXT:    v_readlane_b32 s103, v75, 7
-; GFX11-NEXT:    v_readlane_b32 s102, v75, 6
-; GFX11-NEXT:    v_readlane_b32 s101, v75, 5
-; GFX11-NEXT:    v_readlane_b32 s100, v75, 4
-; GFX11-NEXT:    v_readlane_b32 s99, v75, 3
-; GFX11-NEXT:    v_readlane_b32 s98, v75, 2
-; GFX11-NEXT:    v_readlane_b32 s97, v75, 1
-; GFX11-NEXT:    v_readlane_b32 s96, v75, 0
-; GFX11-NEXT:    v_readlane_b32 s87, v74, 31
-; GFX11-NEXT:    v_readlane_b32 s86, v74, 30
-; GFX11-NEXT:    v_readlane_b32 s85, v74, 29
-; GFX11-NEXT:    v_readlane_b32 s84, v74, 28
-; GFX11-NEXT:    v_readlane_b32 s83, v74, 27
-; GFX11-NEXT:    v_readlane_b32 s82, v74, 26
-; GFX11-NEXT:    v_readlane_b32 s81, v74, 25
-; GFX11-NEXT:    v_readlane_b32 s80, v74, 24
-; GFX11-NEXT:    v_readlane_b32 s71, v74, 23
-; GFX11-NEXT:    v_readlane_b32 s70, v74, 22
-; GFX11-NEXT:    v_readlane_b32 s69, v74, 21
-; GFX11-NEXT:    v_readlane_b32 s68, v74, 20
-; GFX11-NEXT:    v_readlane_b32 s67, v74, 19
-; GFX11-NEXT:    v_readlane_b32 s66, v74, 18
-; GFX11-NEXT:    v_readlane_b32 s65, v74, 17
-; GFX11-NEXT:    v_readlane_b32 s64, v74, 16
-; GFX11-NEXT:    v_readlane_b32 s55, v74, 15
-; GFX11-NEXT:    v_readlane_b32 s54, v74, 14
-; GFX11-NEXT:    v_readlane_b32 s53, v74, 13
-; GFX11-NEXT:    v_readlane_b32 s52, v74, 12
-; GFX11-NEXT:    v_readlane_b32 s51, v74, 11
-; GFX11-NEXT:    v_readlane_b32 s50, v74, 10
-; GFX11-NEXT:    v_readlane_b32 s49, v74, 9
-; GFX11-NEXT:    v_readlane_b32 s48, v74, 8
-; GFX11-NEXT:    v_readlane_b32 s39, v74, 7
-; GFX11-NEXT:    v_readlane_b32 s38, v74, 6
-; GFX11-NEXT:    v_readlane_b32 s37, v74, 5
-; GFX11-NEXT:    v_readlane_b32 s36, v74, 4
-; GFX11-NEXT:    v_readlane_b32 s35, v74, 3
-; GFX11-NEXT:    v_readlane_b32 s34, v74, 2
-; GFX11-NEXT:    v_readlane_b32 s31, v74, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v74, 0
+; GFX11-NEXT:    v_readlane_b32 s30, v75, 7
+; GFX11-NEXT:    v_readlane_b32 s31, v75, 8
+; GFX11-NEXT:    v_readlane_b32 s104, v75, 6
+; GFX11-NEXT:    v_readlane_b32 s103, v75, 5
+; GFX11-NEXT:    v_readlane_b32 s102, v75, 4
+; GFX11-NEXT:    v_readlane_b32 s101, v75, 3
+; GFX11-NEXT:    v_readlane_b32 s100, v75, 2
+; GFX11-NEXT:    v_readlane_b32 s99, v75, 1
+; GFX11-NEXT:    v_readlane_b32 s98, v75, 0
+; GFX11-NEXT:    v_readlane_b32 s97, v74, 31
+; GFX11-NEXT:    v_readlane_b32 s96, v74, 30
+; GFX11-NEXT:    v_readlane_b32 s87, v74, 29
+; GFX11-NEXT:    v_readlane_b32 s86, v74, 28
+; GFX11-NEXT:    v_readlane_b32 s85, v74, 27
+; GFX11-NEXT:    v_readlane_b32 s84, v74, 26
+; GFX11-NEXT:    v_readlane_b32 s83, v74, 25
+; GFX11-NEXT:    v_readlane_b32 s82, v74, 24
+; GFX11-NEXT:    v_readlane_b32 s81, v74, 23
+; GFX11-NEXT:    v_readlane_b32 s80, v74, 22
+; GFX11-NEXT:    v_readlane_b32 s71, v74, 21
+; GFX11-NEXT:    v_readlane_b32 s70, v74, 20
+; GFX11-NEXT:    v_readlane_b32 s69, v74, 19
+; GFX11-NEXT:    v_readlane_b32 s68, v74, 18
+; GFX11-NEXT:    v_readlane_b32 s67, v74, 17
+; GFX11-NEXT:    v_readlane_b32 s66, v74, 16
+; GFX11-NEXT:    v_readlane_b32 s65, v74, 15
+; GFX11-NEXT:    v_readlane_b32 s64, v74, 14
+; GFX11-NEXT:    v_readlane_b32 s55, v74, 13
+; GFX11-NEXT:    v_readlane_b32 s54, v74, 12
+; GFX11-NEXT:    v_readlane_b32 s53, v74, 11
+; GFX11-NEXT:    v_readlane_b32 s52, v74, 10
+; GFX11-NEXT:    v_readlane_b32 s51, v74, 9
+; GFX11-NEXT:    v_readlane_b32 s50, v74, 8
+; GFX11-NEXT:    v_readlane_b32 s49, v74, 7
+; GFX11-NEXT:    v_readlane_b32 s48, v74, 6
+; GFX11-NEXT:    v_readlane_b32 s39, v74, 5
+; GFX11-NEXT:    v_readlane_b32 s38, v74, 4
+; GFX11-NEXT:    v_readlane_b32 s37, v74, 3
+; GFX11-NEXT:    v_readlane_b32 s36, v74, 2
+; GFX11-NEXT:    v_readlane_b32 s35, v74, 1
+; GFX11-NEXT:    v_readlane_b32 s34, v74, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX11-NEXT:    s_clause 0x3 ; 16-byte Folded Reload
 ; GFX11-NEXT:    scratch_load_b32 v74, off, s32 offset:72
@@ -124505,43 +124505,43 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg
 ; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_writelane_b32 v63, s30, 0
-; SI-NEXT:    v_writelane_b32 v63, s31, 1
-; SI-NEXT:    v_writelane_b32 v63, s34, 2
-; SI-NEXT:    v_writelane_b32 v63, s35, 3
-; SI-NEXT:    v_writelane_b32 v63, s36, 4
-; SI-NEXT:    v_writelane_b32 v63, s37, 5
-; SI-NEXT:    v_writelane_b32 v63, s38, 6
-; SI-NEXT:    v_writelane_b32 v63, s39, 7
-; SI-NEXT:    v_writelane_b32 v63, s48, 8
-; SI-NEXT:    v_writelane_b32 v63, s49, 9
-; SI-NEXT:    v_writelane_b32 v63, s50, 10
-; SI-NEXT:    v_writelane_b32 v63, s51, 11
-; SI-NEXT:    v_writelane_b32 v63, s52, 12
-; SI-NEXT:    v_writelane_b32 v63, s53, 13
-; SI-NEXT:    v_writelane_b32 v63, s54, 14
-; SI-NEXT:    v_writelane_b32 v63, s55, 15
-; SI-NEXT:    v_writelane_b32 v63, s64, 16
-; SI-NEXT:    v_writelane_b32 v63, s65, 17
-; SI-NEXT:    v_writelane_b32 v63, s66, 18
-; SI-NEXT:    v_writelane_b32 v63, s67, 19
-; SI-NEXT:    v_writelane_b32 v63, s68, 20
-; SI-NEXT:    v_writelane_b32 v63, s69, 21
-; SI-NEXT:    v_writelane_b32 v63, s70, 22
-; SI-NEXT:    v_writelane_b32 v63, s71, 23
-; SI-NEXT:    v_writelane_b32 v63, s80, 24
-; SI-NEXT:    v_writelane_b32 v63, s81, 25
-; SI-NEXT:    v_writelane_b32 v63, s82, 26
-; SI-NEXT:    v_writelane_b32 v63, s83, 27
-; SI-NEXT:    v_writelane_b32 v63, s84, 28
-; SI-NEXT:    v_writelane_b32 v63, s85, 29
-; SI-NEXT:    v_writelane_b32 v63, s86, 30
-; SI-NEXT:    v_writelane_b32 v63, s87, 31
-; SI-NEXT:    v_writelane_b32 v63, s96, 32
-; SI-NEXT:    v_writelane_b32 v63, s97, 33
-; SI-NEXT:    v_writelane_b32 v63, s98, 34
+; SI-NEXT:    v_writelane_b32 v63, s34, 0
+; SI-NEXT:    v_writelane_b32 v63, s35, 1
+; SI-NEXT:    v_writelane_b32 v63, s36, 2
+; SI-NEXT:    v_writelane_b32 v63, s37, 3
+; SI-NEXT:    v_writelane_b32 v63, s38, 4
+; SI-NEXT:    v_writelane_b32 v63, s39, 5
+; SI-NEXT:    v_writelane_b32 v63, s48, 6
+; SI-NEXT:    v_writelane_b32 v63, s49, 7
+; SI-NEXT:    v_writelane_b32 v63, s50, 8
+; SI-NEXT:    v_writelane_b32 v63, s51, 9
+; SI-NEXT:    v_writelane_b32 v63, s52, 10
+; SI-NEXT:    v_writelane_b32 v63, s53, 11
+; SI-NEXT:    v_writelane_b32 v63, s54, 12
+; SI-NEXT:    v_writelane_b32 v63, s55, 13
+; SI-NEXT:    v_writelane_b32 v63, s64, 14
+; SI-NEXT:    v_writelane_b32 v63, s65, 15
+; SI-NEXT:    v_writelane_b32 v63, s66, 16
+; SI-NEXT:    v_writelane_b32 v63, s67, 17
+; SI-NEXT:    v_writelane_b32 v63, s68, 18
+; SI-NEXT:    v_writelane_b32 v63, s69, 19
+; SI-NEXT:    v_writelane_b32 v63, s70, 20
+; SI-NEXT:    v_writelane_b32 v63, s71, 21
+; SI-NEXT:    v_writelane_b32 v63, s80, 22
+; SI-NEXT:    v_writelane_b32 v63, s81, 23
+; SI-NEXT:    v_writelane_b32 v63, s82, 24
+; SI-NEXT:    v_writelane_b32 v63, s83, 25
+; SI-NEXT:    v_writelane_b32 v63, s84, 26
+; SI-NEXT:    v_writelane_b32 v63, s85, 27
+; SI-NEXT:    v_writelane_b32 v63, s86, 28
+; SI-NEXT:    v_writelane_b32 v63, s87, 29
+; SI-NEXT:    v_writelane_b32 v63, s96, 30
+; SI-NEXT:    v_writelane_b32 v63, s97, 31
+; SI-NEXT:    v_writelane_b32 v63, s98, 32
+; SI-NEXT:    v_writelane_b32 v63, s99, 33
+; SI-NEXT:    v_writelane_b32 v63, s30, 34
 ; SI-NEXT:    v_readfirstlane_b32 s44, v18
-; SI-NEXT:    v_writelane_b32 v63, s99, 35
+; SI-NEXT:    v_writelane_b32 v63, s31, 35
 ; SI-NEXT:    v_readfirstlane_b32 s5, v17
 ; SI-NEXT:    v_readfirstlane_b32 s4, v16
 ; SI-NEXT:    v_readfirstlane_b32 s7, v15
@@ -125003,42 +125003,42 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg
 ; SI-NEXT:    v_mul_f32_e32 v54, 1.0, v54
 ; SI-NEXT:    v_mul_f32_e32 v52, 1.0, v52
 ; SI-NEXT:    v_mul_f32_e32 v38, 1.0, v38
-; SI-NEXT:    v_readlane_b32 s99, v63, 35
-; SI-NEXT:    v_readlane_b32 s98, v63, 34
-; SI-NEXT:    v_readlane_b32 s97, v63, 33
-; SI-NEXT:    v_readlane_b32 s96, v63, 32
-; SI-NEXT:    v_readlane_b32 s87, v63, 31
-; SI-NEXT:    v_readlane_b32 s86, v63, 30
-; SI-NEXT:    v_readlane_b32 s85, v63, 29
-; SI-NEXT:    v_readlane_b32 s84, v63, 28
-; SI-NEXT:    v_readlane_b32 s83, v63, 27
-; SI-NEXT:    v_readlane_b32 s82, v63, 26
-; SI-NEXT:    v_readlane_b32 s81, v63, 25
-; SI-NEXT:    v_readlane_b32 s80, v63, 24
-; SI-NEXT:    v_readlane_b32 s71, v63, 23
-; SI-NEXT:    v_readlane_b32 s70, v63, 22
-; SI-NEXT:    v_readlane_b32 s69, v63, 21
-; SI-NEXT:    v_readlane_b32 s68, v63, 20
-; SI-NEXT:    v_readlane_b32 s67, v63, 19
-; SI-NEXT:    v_readlane_b32 s66, v63, 18
-; SI-NEXT:    v_readlane_b32 s65, v63, 17
-; SI-NEXT:    v_readlane_b32 s64, v63, 16
-; SI-NEXT:    v_readlane_b32 s55, v63, 15
-; SI-NEXT:    v_readlane_b32 s54, v63, 14
-; SI-NEXT:    v_readlane_b32 s53, v63, 13
-; SI-NEXT:    v_readlane_b32 s52, v63, 12
-; SI-NEXT:    v_readlane_b32 s51, v63, 11
-; SI-NEXT:    v_readlane_b32 s50, v63, 10
-; SI-NEXT:    v_readlane_b32 s49, v63, 9
-; SI-NEXT:    v_readlane_b32 s48, v63, 8
-; SI-NEXT:    v_readlane_b32 s39, v63, 7
-; SI-NEXT:    v_readlane_b32 s38, v63, 6
-; SI-NEXT:    v_readlane_b32 s37, v63, 5
-; SI-NEXT:    v_readlane_b32 s36, v63, 4
-; SI-NEXT:    v_readlane_b32 s35, v63, 3
-; SI-NEXT:    v_readlane_b32 s34, v63, 2
-; SI-NEXT:    v_readlane_b32 s31, v63, 1
-; SI-NEXT:    v_readlane_b32 s30, v63, 0
+; SI-NEXT:    v_readlane_b32 s30, v63, 34
+; SI-NEXT:    v_readlane_b32 s31, v63, 35
+; SI-NEXT:    v_readlane_b32 s99, v63, 33
+; SI-NEXT:    v_readlane_b32 s98, v63, 32
+; SI-NEXT:    v_readlane_b32 s97, v63, 31
+; SI-NEXT:    v_readlane_b32 s96, v63, 30
+; SI-NEXT:    v_readlane_b32 s87, v63, 29
+; SI-NEXT:    v_readlane_b32 s86, v63, 28
+; SI-NEXT:    v_readlane_b32 s85, v63, 27
+; SI-NEXT:    v_readlane_b32 s84, v63, 26
+; SI-NEXT:    v_readlane_b32 s83, v63, 25
+; SI-NEXT:    v_readlane_b32 s82, v63, 24
+; SI-NEXT:    v_readlane_b32 s81, v63, 23
+; SI-NEXT:    v_readlane_b32 s80, v63, 22
+; SI-NEXT:    v_readlane_b32 s71, v63, 21
+; SI-NEXT:    v_readlane_b32 s70, v63, 20
+; SI-NEXT:    v_readlane_b32 s69, v63, 19
+; SI-NEXT:    v_readlane_b32 s68, v63, 18
+; SI-NEXT:    v_readlane_b32 s67, v63, 17
+; SI-NEXT:    v_readlane_b32 s66, v63, 16
+; SI-NEXT:    v_readlane_b32 s65, v63, 15
+; SI-NEXT:    v_readlane_b32 s64, v63, 14
+; SI-NEXT:    v_readlane_b32 s55, v63, 13
+; SI-NEXT:    v_readlane_b32 s54, v63, 12
+; SI-NEXT:    v_readlane_b32 s53, v63, 11
+; SI-NEXT:    v_readlane_b32 s52, v63, 10
+; SI-NEXT:    v_readlane_b32 s51, v63, 9
+; SI-NEXT:    v_readlane_b32 s50, v63, 8
+; SI-NEXT:    v_readlane_b32 s49, v63, 7
+; SI-NEXT:    v_readlane_b32 s48, v63, 6
+; SI-NEXT:    v_readlane_b32 s39, v63, 5
+; SI-NEXT:    v_readlane_b32 s38, v63, 4
+; SI-NEXT:    v_readlane_b32 s37, v63, 3
+; SI-NEXT:    v_readlane_b32 s36, v63, 2
+; SI-NEXT:    v_readlane_b32 s35, v63, 1
+; SI-NEXT:    v_readlane_b32 s34, v63, 0
 ; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_mul_f32_e32 v50, 1.0, v0
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
@@ -125132,17 +125132,17 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v32, s30, 0
-; VI-NEXT:    v_writelane_b32 v32, s31, 1
-; VI-NEXT:    v_writelane_b32 v32, s36, 2
-; VI-NEXT:    v_writelane_b32 v32, s37, 3
-; VI-NEXT:    v_writelane_b32 v32, s38, 4
-; VI-NEXT:    v_writelane_b32 v32, s39, 5
-; VI-NEXT:    v_writelane_b32 v32, s48, 6
-; VI-NEXT:    v_writelane_b32 v32, s49, 7
-; VI-NEXT:    v_writelane_b32 v32, s50, 8
+; VI-NEXT:    v_writelane_b32 v32, s36, 0
+; VI-NEXT:    v_writelane_b32 v32, s37, 1
+; VI-NEXT:    v_writelane_b32 v32, s38, 2
+; VI-NEXT:    v_writelane_b32 v32, s39, 3
+; VI-NEXT:    v_writelane_b32 v32, s48, 4
+; VI-NEXT:    v_writelane_b32 v32, s49, 5
+; VI-NEXT:    v_writelane_b32 v32, s50, 6
+; VI-NEXT:    v_writelane_b32 v32, s51, 7
+; VI-NEXT:    v_writelane_b32 v32, s30, 8
 ; VI-NEXT:    v_readfirstlane_b32 s4, v18
-; VI-NEXT:    v_writelane_b32 v32, s51, 9
+; VI-NEXT:    v_writelane_b32 v32, s31, 9
 ; VI-NEXT:    v_readfirstlane_b32 s51, v17
 ; VI-NEXT:    v_readfirstlane_b32 s50, v16
 ; VI-NEXT:    v_readfirstlane_b32 s49, v15
@@ -125219,16 +125219,16 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg
 ; VI-NEXT:    v_mov_b32_e32 v14, s30
 ; VI-NEXT:    v_mov_b32_e32 v15, s31
 ; VI-NEXT:  .LBB77_5: ; %end
-; VI-NEXT:    v_readlane_b32 s51, v32, 9
-; VI-NEXT:    v_readlane_b32 s50, v32, 8
-; VI-NEXT:    v_readlane_b32 s49, v32, 7
-; VI-NEXT:    v_readlane_b32 s48, v32, 6
-; VI-NEXT:    v_readlane_b32 s39, v32, 5
-; VI-NEXT:    v_readlane_b32 s38, v32, 4
-; VI-NEXT:    v_readlane_b32 s37, v32, 3
-; VI-NEXT:    v_readlane_b32 s36, v32, 2
-; VI-NEXT:    v_readlane_b32 s31, v32, 1
-; VI-NEXT:    v_readlane_b32 s30, v32, 0
+; VI-NEXT:    v_readlane_b32 s30, v32, 8
+; VI-NEXT:    v_readlane_b32 s31, v32, 9
+; VI-NEXT:    v_readlane_b32 s51, v32, 7
+; VI-NEXT:    v_readlane_b32 s50, v32, 6
+; VI-NEXT:    v_readlane_b32 s49, v32, 5
+; VI-NEXT:    v_readlane_b32 s48, v32, 4
+; VI-NEXT:    v_readlane_b32 s39, v32, 3
+; VI-NEXT:    v_readlane_b32 s38, v32, 2
+; VI-NEXT:    v_readlane_b32 s37, v32, 1
+; VI-NEXT:    v_readlane_b32 s36, v32, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -125241,17 +125241,17 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v32, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v32, s31, 1
-; GFX9-NEXT:    v_writelane_b32 v32, s36, 2
-; GFX9-NEXT:    v_writelane_b32 v32, s37, 3
-; GFX9-NEXT:    v_writelane_b32 v32, s38, 4
-; GFX9-NEXT:    v_writelane_b32 v32, s39, 5
-; GFX9-NEXT:    v_writelane_b32 v32, s48, 6
-; GFX9-NEXT:    v_writelane_b32 v32, s49, 7
-; GFX9-NEXT:    v_writelane_b32 v32, s50, 8
+; GFX9-NEXT:    v_writelane_b32 v32, s36, 0
+; GFX9-NEXT:    v_writelane_b32 v32, s37, 1
+; GFX9-NEXT:    v_writelane_b32 v32, s38, 2
+; GFX9-NEXT:    v_writelane_b32 v32, s39, 3
+; GFX9-NEXT:    v_writelane_b32 v32, s48, 4
+; GFX9-NEXT:    v_writelane_b32 v32, s49, 5
+; GFX9-NEXT:    v_writelane_b32 v32, s50, 6
+; GFX9-NEXT:    v_writelane_b32 v32, s51, 7
+; GFX9-NEXT:    v_writelane_b32 v32, s30, 8
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v18
-; GFX9-NEXT:    v_writelane_b32 v32, s51, 9
+; GFX9-NEXT:    v_writelane_b32 v32, s31, 9
 ; GFX9-NEXT:    v_readfirstlane_b32 s51, v17
 ; GFX9-NEXT:    v_readfirstlane_b32 s50, v16
 ; GFX9-NEXT:    v_readfirstlane_b32 s49, v15
@@ -125328,16 +125328,16 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg
 ; GFX9-NEXT:    v_mov_b32_e32 v14, s30
 ; GFX9-NEXT:    v_mov_b32_e32 v15, s31
 ; GFX9-NEXT:  .LBB77_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s51, v32, 9
-; GFX9-NEXT:    v_readlane_b32 s50, v32, 8
-; GFX9-NEXT:    v_readlane_b32 s49, v32, 7
-; GFX9-NEXT:    v_readlane_b32 s48, v32, 6
-; GFX9-NEXT:    v_readlane_b32 s39, v32, 5
-; GFX9-NEXT:    v_readlane_b32 s38, v32, 4
-; GFX9-NEXT:    v_readlane_b32 s37, v32, 3
-; GFX9-NEXT:    v_readlane_b32 s36, v32, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v32, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v32, 0
+; GFX9-NEXT:    v_readlane_b32 s30, v32, 8
+; GFX9-NEXT:    v_readlane_b32 s31, v32, 9
+; GFX9-NEXT:    v_readlane_b32 s51, v32, 7
+; GFX9-NEXT:    v_readlane_b32 s50, v32, 6
+; GFX9-NEXT:    v_readlane_b32 s49, v32, 5
+; GFX9-NEXT:    v_readlane_b32 s48, v32, 4
+; GFX9-NEXT:    v_readlane_b32 s39, v32, 3
+; GFX9-NEXT:    v_readlane_b32 s38, v32, 2
+; GFX9-NEXT:    v_readlane_b32 s37, v32, 1
+; GFX9-NEXT:    v_readlane_b32 s36, v32, 0
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -128453,15 +128453,17 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
 ; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v63, s30, 0
-; SI-NEXT:    v_writelane_b32 v63, s31, 1
+; SI-NEXT:    v_writelane_b32 v63, s34, 0
+; SI-NEXT:    v_writelane_b32 v63, s35, 1
+; SI-NEXT:    v_writelane_b32 v63, s36, 2
+; SI-NEXT:    v_writelane_b32 v63, s37, 3
+; SI-NEXT:    v_writelane_b32 v63, s30, 4
 ; SI-NEXT:    v_readfirstlane_b32 vcc_lo, v2
-; SI-NEXT:    v_writelane_b32 v63, s34, 2
+; SI-NEXT:    v_writelane_b32 v63, s31, 5
 ; SI-NEXT:    s_and_b32 s12, s25, 0xffff0000
 ; SI-NEXT:    s_and_b32 s30, vcc_lo, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s31, vcc_lo, 16
 ; SI-NEXT:    v_readfirstlane_b32 vcc_lo, v1
-; SI-NEXT:    v_writelane_b32 v63, s35, 3
 ; SI-NEXT:    s_and_b32 s6, s28, 0xffff0000
 ; SI-NEXT:    s_and_b32 s34, vcc_lo, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s35, vcc_lo, 16
@@ -128539,13 +128541,11 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mul_f32_e64 v0, 1.0, s47
-; SI-NEXT:    v_writelane_b32 v63, s36, 4
 ; SI-NEXT:    s_and_b32 s43, s42, 0xffff0000
 ; SI-NEXT:    v_readfirstlane_b32 s92, v4
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mul_f32_e64 v0, 1.0, s45
-; SI-NEXT:    v_writelane_b32 v63, s37, 5
 ; SI-NEXT:    s_and_b32 s4, s29, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s5, s29, 16
 ; SI-NEXT:    s_lshl_b32 s7, s28, 16
@@ -129259,12 +129259,12 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
 ; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; SI-NEXT:    v_readlane_b32 s37, v63, 5
-; SI-NEXT:    v_readlane_b32 s36, v63, 4
-; SI-NEXT:    v_readlane_b32 s35, v63, 3
-; SI-NEXT:    v_readlane_b32 s34, v63, 2
-; SI-NEXT:    v_readlane_b32 s31, v63, 1
-; SI-NEXT:    v_readlane_b32 s30, v63, 0
+; SI-NEXT:    v_readlane_b32 s30, v63, 4
+; SI-NEXT:    v_readlane_b32 s31, v63, 5
+; SI-NEXT:    v_readlane_b32 s37, v63, 3
+; SI-NEXT:    v_readlane_b32 s36, v63, 2
+; SI-NEXT:    v_readlane_b32 s35, v63, 1
+; SI-NEXT:    v_readlane_b32 s34, v63, 0
 ; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -132615,29 +132615,29 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a
 ; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v63, s30, 0
-; SI-NEXT:    v_writelane_b32 v63, s31, 1
-; SI-NEXT:    v_writelane_b32 v63, s34, 2
-; SI-NEXT:    v_writelane_b32 v63, s35, 3
-; SI-NEXT:    v_writelane_b32 v63, s36, 4
-; SI-NEXT:    v_writelane_b32 v63, s37, 5
-; SI-NEXT:    v_writelane_b32 v63, s38, 6
-; SI-NEXT:    v_writelane_b32 v63, s39, 7
-; SI-NEXT:    v_writelane_b32 v63, s48, 8
-; SI-NEXT:    v_writelane_b32 v63, s49, 9
-; SI-NEXT:    v_writelane_b32 v63, s50, 10
-; SI-NEXT:    v_writelane_b32 v63, s51, 11
-; SI-NEXT:    v_writelane_b32 v63, s52, 12
-; SI-NEXT:    v_writelane_b32 v63, s53, 13
-; SI-NEXT:    v_writelane_b32 v63, s54, 14
-; SI-NEXT:    v_writelane_b32 v63, s55, 15
-; SI-NEXT:    v_writelane_b32 v63, s64, 16
-; SI-NEXT:    v_writelane_b32 v63, s65, 17
-; SI-NEXT:    v_writelane_b32 v63, s66, 18
-; SI-NEXT:    v_writelane_b32 v63, s67, 19
-; SI-NEXT:    v_writelane_b32 v63, s68, 20
+; SI-NEXT:    v_writelane_b32 v63, s34, 0
+; SI-NEXT:    v_writelane_b32 v63, s35, 1
+; SI-NEXT:    v_writelane_b32 v63, s36, 2
+; SI-NEXT:    v_writelane_b32 v63, s37, 3
+; SI-NEXT:    v_writelane_b32 v63, s38, 4
+; SI-NEXT:    v_writelane_b32 v63, s39, 5
+; SI-NEXT:    v_writelane_b32 v63, s48, 6
+; SI-NEXT:    v_writelane_b32 v63, s49, 7
+; SI-NEXT:    v_writelane_b32 v63, s50, 8
+; SI-NEXT:    v_writelane_b32 v63, s51, 9
+; SI-NEXT:    v_writelane_b32 v63, s52, 10
+; SI-NEXT:    v_writelane_b32 v63, s53, 11
+; SI-NEXT:    v_writelane_b32 v63, s54, 12
+; SI-NEXT:    v_writelane_b32 v63, s55, 13
+; SI-NEXT:    v_writelane_b32 v63, s64, 14
+; SI-NEXT:    v_writelane_b32 v63, s65, 15
+; SI-NEXT:    v_writelane_b32 v63, s66, 16
+; SI-NEXT:    v_writelane_b32 v63, s67, 17
+; SI-NEXT:    v_writelane_b32 v63, s68, 18
+; SI-NEXT:    v_writelane_b32 v63, s69, 19
+; SI-NEXT:    v_writelane_b32 v63, s30, 20
 ; SI-NEXT:    v_readfirstlane_b32 s4, v18
-; SI-NEXT:    v_writelane_b32 v63, s69, 21
+; SI-NEXT:    v_writelane_b32 v63, s31, 21
 ; SI-NEXT:    v_readfirstlane_b32 s45, v17
 ; SI-NEXT:    v_readfirstlane_b32 s44, v16
 ; SI-NEXT:    v_readfirstlane_b32 s43, v15
@@ -132967,6 +132967,7 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a
 ; SI-NEXT:    v_and_b32_e32 v27, 0xffff, v27
 ; SI-NEXT:    v_and_b32_e32 v29, 0xffff, v29
 ; SI-NEXT:    v_and_b32_e32 v31, 0xffff, v31
+; SI-NEXT:    v_readlane_b32 s30, v63, 20
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v55
 ; SI-NEXT:    v_or_b32_e32 v17, v17, v39
 ; SI-NEXT:    v_or_b32_e32 v19, v19, v38
@@ -132974,28 +132975,27 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a
 ; SI-NEXT:    v_or_b32_e32 v23, v23, v36
 ; SI-NEXT:    v_or_b32_e32 v25, v25, v35
 ; SI-NEXT:    v_or_b32_e32 v31, v31, v32
-; SI-NEXT:    v_readlane_b32 s69, v63, 21
-; SI-NEXT:    v_readlane_b32 s68, v63, 20
-; SI-NEXT:    v_readlane_b32 s67, v63, 19
-; SI-NEXT:    v_readlane_b32 s66, v63, 18
-; SI-NEXT:    v_readlane_b32 s65, v63, 17
-; SI-NEXT:    v_readlane_b32 s64, v63, 16
-; SI-NEXT:    v_readlane_b32 s55, v63, 15
-; SI-NEXT:    v_readlane_b32 s54, v63, 14
-; SI-NEXT:    v_readlane_b32 s53, v63, 13
-; SI-NEXT:    v_readlane_b32 s52, v63, 12
-; SI-NEXT:    v_readlane_b32 s51, v63, 11
-; SI-NEXT:    v_readlane_b32 s50, v63, 10
-; SI-NEXT:    v_readlane_b32 s49, v63, 9
-; SI-NEXT:    v_readlane_b32 s48, v63, 8
-; SI-NEXT:    v_readlane_b32 s39, v63, 7
-; SI-NEXT:    v_readlane_b32 s38, v63, 6
-; SI-NEXT:    v_readlane_b32 s37, v63, 5
-; SI-NEXT:    v_readlane_b32 s36, v63, 4
-; SI-NEXT:    v_readlane_b32 s35, v63, 3
-; SI-NEXT:    v_readlane_b32 s34, v63, 2
-; SI-NEXT:    v_readlane_b32 s31, v63, 1
-; SI-NEXT:    v_readlane_b32 s30, v63, 0
+; SI-NEXT:    v_readlane_b32 s31, v63, 21
+; SI-NEXT:    v_readlane_b32 s69, v63, 19
+; SI-NEXT:    v_readlane_b32 s68, v63, 18
+; SI-NEXT:    v_readlane_b32 s67, v63, 17
+; SI-NEXT:    v_readlane_b32 s66, v63, 16
+; SI-NEXT:    v_readlane_b32 s65, v63, 15
+; SI-NEXT:    v_readlane_b32 s64, v63, 14
+; SI-NEXT:    v_readlane_b32 s55, v63, 13
+; SI-NEXT:    v_readlane_b32 s54, v63, 12
+; SI-NEXT:    v_readlane_b32 s53, v63, 11
+; SI-NEXT:    v_readlane_b32 s52, v63, 10
+; SI-NEXT:    v_readlane_b32 s51, v63, 9
+; SI-NEXT:    v_readlane_b32 s50, v63, 8
+; SI-NEXT:    v_readlane_b32 s49, v63, 7
+; SI-NEXT:    v_readlane_b32 s48, v63, 6
+; SI-NEXT:    v_readlane_b32 s39, v63, 5
+; SI-NEXT:    v_readlane_b32 s38, v63, 4
+; SI-NEXT:    v_readlane_b32 s37, v63, 3
+; SI-NEXT:    v_readlane_b32 s36, v63, 2
+; SI-NEXT:    v_readlane_b32 s35, v63, 1
+; SI-NEXT:    v_readlane_b32 s34, v63, 0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v34, 16, v34
 ; SI-NEXT:    v_or_b32_e32 v27, v27, v34
@@ -133014,17 +133014,17 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v32, s30, 0
-; VI-NEXT:    v_writelane_b32 v32, s31, 1
-; VI-NEXT:    v_writelane_b32 v32, s36, 2
-; VI-NEXT:    v_writelane_b32 v32, s37, 3
-; VI-NEXT:    v_writelane_b32 v32, s38, 4
-; VI-NEXT:    v_writelane_b32 v32, s39, 5
-; VI-NEXT:    v_writelane_b32 v32, s48, 6
-; VI-NEXT:    v_writelane_b32 v32, s49, 7
-; VI-NEXT:    v_writelane_b32 v32, s50, 8
+; VI-NEXT:    v_writelane_b32 v32, s36, 0
+; VI-NEXT:    v_writelane_b32 v32, s37, 1
+; VI-NEXT:    v_writelane_b32 v32, s38, 2
+; VI-NEXT:    v_writelane_b32 v32, s39, 3
+; VI-NEXT:    v_writelane_b32 v32, s48, 4
+; VI-NEXT:    v_writelane_b32 v32, s49, 5
+; VI-NEXT:    v_writelane_b32 v32, s50, 6
+; VI-NEXT:    v_writelane_b32 v32, s51, 7
+; VI-NEXT:    v_writelane_b32 v32, s30, 8
 ; VI-NEXT:    v_readfirstlane_b32 s4, v18
-; VI-NEXT:    v_writelane_b32 v32, s51, 9
+; VI-NEXT:    v_writelane_b32 v32, s31, 9
 ; VI-NEXT:    v_readfirstlane_b32 s51, v17
 ; VI-NEXT:    v_readfirstlane_b32 s50, v16
 ; VI-NEXT:    v_readfirstlane_b32 s49, v15
@@ -133101,16 +133101,16 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a
 ; VI-NEXT:    v_mov_b32_e32 v14, s30
 ; VI-NEXT:    v_mov_b32_e32 v15, s31
 ; VI-NEXT:  .LBB81_5: ; %end
-; VI-NEXT:    v_readlane_b32 s51, v32, 9
-; VI-NEXT:    v_readlane_b32 s50, v32, 8
-; VI-NEXT:    v_readlane_b32 s49, v32, 7
-; VI-NEXT:    v_readlane_b32 s48, v32, 6
-; VI-NEXT:    v_readlane_b32 s39, v32, 5
-; VI-NEXT:    v_readlane_b32 s38, v32, 4
-; VI-NEXT:    v_readlane_b32 s37, v32, 3
-; VI-NEXT:    v_readlane_b32 s36, v32, 2
-; VI-NEXT:    v_readlane_b32 s31, v32, 1
-; VI-NEXT:    v_readlane_b32 s30, v32, 0
+; VI-NEXT:    v_readlane_b32 s30, v32, 8
+; VI-NEXT:    v_readlane_b32 s31, v32, 9
+; VI-NEXT:    v_readlane_b32 s51, v32, 7
+; VI-NEXT:    v_readlane_b32 s50, v32, 6
+; VI-NEXT:    v_readlane_b32 s49, v32, 5
+; VI-NEXT:    v_readlane_b32 s48, v32, 4
+; VI-NEXT:    v_readlane_b32 s39, v32, 3
+; VI-NEXT:    v_readlane_b32 s38, v32, 2
+; VI-NEXT:    v_readlane_b32 s37, v32, 1
+; VI-NEXT:    v_readlane_b32 s36, v32, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -133123,17 +133123,17 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v32, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v32, s31, 1
-; GFX9-NEXT:    v_writelane_b32 v32, s36, 2
-; GFX9-NEXT:    v_writelane_b32 v32, s37, 3
-; GFX9-NEXT:    v_writelane_b32 v32, s38, 4
-; GFX9-NEXT:    v_writelane_b32 v32, s39, 5
-; GFX9-NEXT:    v_writelane_b32 v32, s48, 6
-; GFX9-NEXT:    v_writelane_b32 v32, s49, 7
-; GFX9-NEXT:    v_writelane_b32 v32, s50, 8
+; GFX9-NEXT:    v_writelane_b32 v32, s36, 0
+; GFX9-NEXT:    v_writelane_b32 v32, s37, 1
+; GFX9-NEXT:    v_writelane_b32 v32, s38, 2
+; GFX9-NEXT:    v_writelane_b32 v32, s39, 3
+; GFX9-NEXT:    v_writelane_b32 v32, s48, 4
+; GFX9-NEXT:    v_writelane_b32 v32, s49, 5
+; GFX9-NEXT:    v_writelane_b32 v32, s50, 6
+; GFX9-NEXT:    v_writelane_b32 v32, s51, 7
+; GFX9-NEXT:    v_writelane_b32 v32, s30, 8
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v18
-; GFX9-NEXT:    v_writelane_b32 v32, s51, 9
+; GFX9-NEXT:    v_writelane_b32 v32, s31, 9
 ; GFX9-NEXT:    v_readfirstlane_b32 s51, v17
 ; GFX9-NEXT:    v_readfirstlane_b32 s50, v16
 ; GFX9-NEXT:    v_readfirstlane_b32 s49, v15
@@ -133210,16 +133210,16 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a
 ; GFX9-NEXT:    v_mov_b32_e32 v14, s30
 ; GFX9-NEXT:    v_mov_b32_e32 v15, s31
 ; GFX9-NEXT:  .LBB81_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s51, v32, 9
-; GFX9-NEXT:    v_readlane_b32 s50, v32, 8
-; GFX9-NEXT:    v_readlane_b32 s49, v32, 7
-; GFX9-NEXT:    v_readlane_b32 s48, v32, 6
-; GFX9-NEXT:    v_readlane_b32 s39, v32, 5
-; GFX9-NEXT:    v_readlane_b32 s38, v32, 4
-; GFX9-NEXT:    v_readlane_b32 s37, v32, 3
-; GFX9-NEXT:    v_readlane_b32 s36, v32, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v32, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v32, 0
+; GFX9-NEXT:    v_readlane_b32 s30, v32, 8
+; GFX9-NEXT:    v_readlane_b32 s31, v32, 9
+; GFX9-NEXT:    v_readlane_b32 s51, v32, 7
+; GFX9-NEXT:    v_readlane_b32 s50, v32, 6
+; GFX9-NEXT:    v_readlane_b32 s49, v32, 5
+; GFX9-NEXT:    v_readlane_b32 s48, v32, 4
+; GFX9-NEXT:    v_readlane_b32 s39, v32, 3
+; GFX9-NEXT:    v_readlane_b32 s38, v32, 2
+; GFX9-NEXT:    v_readlane_b32 s37, v32, 1
+; GFX9-NEXT:    v_readlane_b32 s36, v32, 0
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -134303,45 +134303,46 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a
 ; SI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_writelane_b32 v34, s30, 0
-; SI-NEXT:    v_writelane_b32 v34, s31, 1
-; SI-NEXT:    v_writelane_b32 v34, s34, 2
-; SI-NEXT:    v_writelane_b32 v34, s35, 3
-; SI-NEXT:    v_writelane_b32 v34, s36, 4
-; SI-NEXT:    v_writelane_b32 v34, s37, 5
-; SI-NEXT:    v_writelane_b32 v34, s38, 6
-; SI-NEXT:    v_writelane_b32 v34, s39, 7
-; SI-NEXT:    v_writelane_b32 v34, s48, 8
-; SI-NEXT:    v_writelane_b32 v34, s49, 9
-; SI-NEXT:    v_writelane_b32 v34, s50, 10
-; SI-NEXT:    v_writelane_b32 v34, s51, 11
-; SI-NEXT:    v_writelane_b32 v34, s52, 12
-; SI-NEXT:    v_writelane_b32 v34, s53, 13
-; SI-NEXT:    v_writelane_b32 v34, s54, 14
-; SI-NEXT:    v_writelane_b32 v34, s55, 15
-; SI-NEXT:    v_writelane_b32 v34, s64, 16
-; SI-NEXT:    v_writelane_b32 v34, s65, 17
-; SI-NEXT:    v_writelane_b32 v34, s66, 18
-; SI-NEXT:    v_writelane_b32 v34, s67, 19
-; SI-NEXT:    v_writelane_b32 v34, s68, 20
-; SI-NEXT:    v_writelane_b32 v34, s69, 21
-; SI-NEXT:    v_writelane_b32 v34, s70, 22
-; SI-NEXT:    v_writelane_b32 v34, s71, 23
-; SI-NEXT:    v_writelane_b32 v34, s80, 24
-; SI-NEXT:    v_writelane_b32 v34, s81, 25
-; SI-NEXT:    v_writelane_b32 v34, s82, 26
-; SI-NEXT:    v_writelane_b32 v34, s83, 27
-; SI-NEXT:    v_writelane_b32 v34, s84, 28
-; SI-NEXT:    v_writelane_b32 v34, s85, 29
-; SI-NEXT:    v_writelane_b32 v34, s86, 30
-; SI-NEXT:    v_writelane_b32 v34, s87, 31
-; SI-NEXT:    v_writelane_b32 v34, s96, 32
+; SI-NEXT:    v_writelane_b32 v34, s34, 0
+; SI-NEXT:    v_writelane_b32 v34, s35, 1
+; SI-NEXT:    v_writelane_b32 v34, s36, 2
+; SI-NEXT:    v_writelane_b32 v34, s37, 3
+; SI-NEXT:    v_writelane_b32 v34, s38, 4
+; SI-NEXT:    v_writelane_b32 v34, s39, 5
+; SI-NEXT:    v_writelane_b32 v34, s48, 6
+; SI-NEXT:    v_writelane_b32 v34, s49, 7
+; SI-NEXT:    v_writelane_b32 v34, s50, 8
+; SI-NEXT:    v_writelane_b32 v34, s51, 9
+; SI-NEXT:    v_writelane_b32 v34, s52, 10
+; SI-NEXT:    v_writelane_b32 v34, s53, 11
+; SI-NEXT:    v_writelane_b32 v34, s54, 12
+; SI-NEXT:    v_writelane_b32 v34, s55, 13
+; SI-NEXT:    v_writelane_b32 v34, s64, 14
+; SI-NEXT:    v_writelane_b32 v34, s65, 15
+; SI-NEXT:    v_writelane_b32 v34, s66, 16
+; SI-NEXT:    v_writelane_b32 v34, s67, 17
+; SI-NEXT:    v_writelane_b32 v34, s68, 18
+; SI-NEXT:    v_writelane_b32 v34, s69, 19
+; SI-NEXT:    v_writelane_b32 v34, s70, 20
+; SI-NEXT:    v_writelane_b32 v34, s71, 21
+; SI-NEXT:    v_writelane_b32 v34, s80, 22
+; SI-NEXT:    v_writelane_b32 v34, s81, 23
+; SI-NEXT:    v_writelane_b32 v34, s82, 24
+; SI-NEXT:    v_writelane_b32 v34, s83, 25
+; SI-NEXT:    v_writelane_b32 v34, s84, 26
+; SI-NEXT:    v_writelane_b32 v34, s85, 27
+; SI-NEXT:    v_writelane_b32 v34, s86, 28
+; SI-NEXT:    v_writelane_b32 v34, s87, 29
+; SI-NEXT:    v_writelane_b32 v34, s96, 30
+; SI-NEXT:    v_writelane_b32 v34, s97, 31
+; SI-NEXT:    v_writelane_b32 v34, s98, 32
+; SI-NEXT:    v_writelane_b32 v34, s99, 33
 ; SI-NEXT:    v_readfirstlane_b32 s6, v17
-; SI-NEXT:    v_writelane_b32 v34, s97, 33
+; SI-NEXT:    v_writelane_b32 v34, s30, 34
 ; SI-NEXT:    s_lshr_b32 vcc_lo, s6, 16
 ; SI-NEXT:    v_readfirstlane_b32 s8, v16
 ; SI-NEXT:    ; implicit-def: $vgpr35 : SGPR spill to VGPR lane
-; SI-NEXT:    v_writelane_b32 v34, s98, 34
+; SI-NEXT:    v_writelane_b32 v34, s31, 35
 ; SI-NEXT:    s_lshr_b32 vcc_hi, s8, 16
 ; SI-NEXT:    v_readfirstlane_b32 s10, v15
 ; SI-NEXT:    v_readfirstlane_b32 s12, v14
@@ -134361,7 +134362,6 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a
 ; SI-NEXT:    v_readfirstlane_b32 s83, v0
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_writelane_b32 v35, vcc_lo, 0
-; SI-NEXT:    v_writelane_b32 v34, s99, 35
 ; SI-NEXT:    s_lshr_b32 s69, s29, 16
 ; SI-NEXT:    s_lshr_b32 s71, s28, 16
 ; SI-NEXT:    s_lshr_b32 s82, s27, 16
@@ -134795,42 +134795,42 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a
 ; SI-NEXT:    v_mov_b32_e32 v30, s66
 ; SI-NEXT:    v_mov_b32_e32 v31, s67
 ; SI-NEXT:  .LBB83_5: ; %end
-; SI-NEXT:    v_readlane_b32 s99, v34, 35
-; SI-NEXT:    v_readlane_b32 s98, v34, 34
-; SI-NEXT:    v_readlane_b32 s97, v34, 33
-; SI-NEXT:    v_readlane_b32 s96, v34, 32
-; SI-NEXT:    v_readlane_b32 s87, v34, 31
-; SI-NEXT:    v_readlane_b32 s86, v34, 30
-; SI-NEXT:    v_readlane_b32 s85, v34, 29
-; SI-NEXT:    v_readlane_b32 s84, v34, 28
-; SI-NEXT:    v_readlane_b32 s83, v34, 27
-; SI-NEXT:    v_readlane_b32 s82, v34, 26
-; SI-NEXT:    v_readlane_b32 s81, v34, 25
-; SI-NEXT:    v_readlane_b32 s80, v34, 24
-; SI-NEXT:    v_readlane_b32 s71, v34, 23
-; SI-NEXT:    v_readlane_b32 s70, v34, 22
-; SI-NEXT:    v_readlane_b32 s69, v34, 21
-; SI-NEXT:    v_readlane_b32 s68, v34, 20
-; SI-NEXT:    v_readlane_b32 s67, v34, 19
-; SI-NEXT:    v_readlane_b32 s66, v34, 18
-; SI-NEXT:    v_readlane_b32 s65, v34, 17
-; SI-NEXT:    v_readlane_b32 s64, v34, 16
-; SI-NEXT:    v_readlane_b32 s55, v34, 15
-; SI-NEXT:    v_readlane_b32 s54, v34, 14
-; SI-NEXT:    v_readlane_b32 s53, v34, 13
-; SI-NEXT:    v_readlane_b32 s52, v34, 12
-; SI-NEXT:    v_readlane_b32 s51, v34, 11
-; SI-NEXT:    v_readlane_b32 s50, v34, 10
-; SI-NEXT:    v_readlane_b32 s49, v34, 9
-; SI-NEXT:    v_readlane_b32 s48, v34, 8
-; SI-NEXT:    v_readlane_b32 s39, v34, 7
-; SI-NEXT:    v_readlane_b32 s38, v34, 6
-; SI-NEXT:    v_readlane_b32 s37, v34, 5
-; SI-NEXT:    v_readlane_b32 s36, v34, 4
-; SI-NEXT:    v_readlane_b32 s35, v34, 3
-; SI-NEXT:    v_readlane_b32 s34, v34, 2
-; SI-NEXT:    v_readlane_b32 s31, v34, 1
-; SI-NEXT:    v_readlane_b32 s30, v34, 0
+; SI-NEXT:    v_readlane_b32 s30, v34, 34
+; SI-NEXT:    v_readlane_b32 s31, v34, 35
+; SI-NEXT:    v_readlane_b32 s99, v34, 33
+; SI-NEXT:    v_readlane_b32 s98, v34, 32
+; SI-NEXT:    v_readlane_b32 s97, v34, 31
+; SI-NEXT:    v_readlane_b32 s96, v34, 30
+; SI-NEXT:    v_readlane_b32 s87, v34, 29
+; SI-NEXT:    v_readlane_b32 s86, v34, 28
+; SI-NEXT:    v_readlane_b32 s85, v34, 27
+; SI-NEXT:    v_readlane_b32 s84, v34, 26
+; SI-NEXT:    v_readlane_b32 s83, v34, 25
+; SI-NEXT:    v_readlane_b32 s82, v34, 24
+; SI-NEXT:    v_readlane_b32 s81, v34, 23
+; SI-NEXT:    v_readlane_b32 s80, v34, 22
+; SI-NEXT:    v_readlane_b32 s71, v34, 21
+; SI-NEXT:    v_readlane_b32 s70, v34, 20
+; SI-NEXT:    v_readlane_b32 s69, v34, 19
+; SI-NEXT:    v_readlane_b32 s68, v34, 18
+; SI-NEXT:    v_readlane_b32 s67, v34, 17
+; SI-NEXT:    v_readlane_b32 s66, v34, 16
+; SI-NEXT:    v_readlane_b32 s65, v34, 15
+; SI-NEXT:    v_readlane_b32 s64, v34, 14
+; SI-NEXT:    v_readlane_b32 s55, v34, 13
+; SI-NEXT:    v_readlane_b32 s54, v34, 12
+; SI-NEXT:    v_readlane_b32 s53, v34, 11
+; SI-NEXT:    v_readlane_b32 s52, v34, 10
+; SI-NEXT:    v_readlane_b32 s51, v34, 9
+; SI-NEXT:    v_readlane_b32 s50, v34, 8
+; SI-NEXT:    v_readlane_b32 s49, v34, 7
+; SI-NEXT:    v_readlane_b32 s48, v34, 6
+; SI-NEXT:    v_readlane_b32 s39, v34, 5
+; SI-NEXT:    v_readlane_b32 s38, v34, 4
+; SI-NEXT:    v_readlane_b32 s37, v34, 3
+; SI-NEXT:    v_readlane_b32 s36, v34, 2
+; SI-NEXT:    v_readlane_b32 s35, v34, 1
+; SI-NEXT:    v_readlane_b32 s34, v34, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -135828,29 +135828,29 @@ define inreg <64 x i16> @bitcast_v16f64_to_v64i16_scalar(<16 x double> inreg %a,
 ; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v63, s30, 0
-; SI-NEXT:    v_writelane_b32 v63, s31, 1
-; SI-NEXT:    v_writelane_b32 v63, s34, 2
-; SI-NEXT:    v_writelane_b32 v63, s35, 3
-; SI-NEXT:    v_writelane_b32 v63, s36, 4
-; SI-NEXT:    v_writelane_b32 v63, s37, 5
-; SI-NEXT:    v_writelane_b32 v63, s38, 6
-; SI-NEXT:    v_writelane_b32 v63, s39, 7
-; SI-NEXT:    v_writelane_b32 v63, s48, 8
-; SI-NEXT:    v_writelane_b32 v63, s49, 9
-; SI-NEXT:    v_writelane_b32 v63, s50, 10
-; SI-NEXT:    v_writelane_b32 v63, s51, 11
-; SI-NEXT:    v_writelane_b32 v63, s52, 12
-; SI-NEXT:    v_writelane_b32 v63, s53, 13
-; SI-NEXT:    v_writelane_b32 v63, s54, 14
-; SI-NEXT:    v_writelane_b32 v63, s55, 15
-; SI-NEXT:    v_writelane_b32 v63, s64, 16
-; SI-NEXT:    v_writelane_b32 v63, s65, 17
-; SI-NEXT:    v_writelane_b32 v63, s66, 18
-; SI-NEXT:    v_writelane_b32 v63, s67, 19
-; SI-NEXT:    v_writelane_b32 v63, s68, 20
+; SI-NEXT:    v_writelane_b32 v63, s34, 0
+; SI-NEXT:    v_writelane_b32 v63, s35, 1
+; SI-NEXT:    v_writelane_b32 v63, s36, 2
+; SI-NEXT:    v_writelane_b32 v63, s37, 3
+; SI-NEXT:    v_writelane_b32 v63, s38, 4
+; SI-NEXT:    v_writelane_b32 v63, s39, 5
+; SI-NEXT:    v_writelane_b32 v63, s48, 6
+; SI-NEXT:    v_writelane_b32 v63, s49, 7
+; SI-NEXT:    v_writelane_b32 v63, s50, 8
+; SI-NEXT:    v_writelane_b32 v63, s51, 9
+; SI-NEXT:    v_writelane_b32 v63, s52, 10
+; SI-NEXT:    v_writelane_b32 v63, s53, 11
+; SI-NEXT:    v_writelane_b32 v63, s54, 12
+; SI-NEXT:    v_writelane_b32 v63, s55, 13
+; SI-NEXT:    v_writelane_b32 v63, s64, 14
+; SI-NEXT:    v_writelane_b32 v63, s65, 15
+; SI-NEXT:    v_writelane_b32 v63, s66, 16
+; SI-NEXT:    v_writelane_b32 v63, s67, 17
+; SI-NEXT:    v_writelane_b32 v63, s68, 18
+; SI-NEXT:    v_writelane_b32 v63, s69, 19
+; SI-NEXT:    v_writelane_b32 v63, s30, 20
 ; SI-NEXT:    v_readfirstlane_b32 s4, v18
-; SI-NEXT:    v_writelane_b32 v63, s69, 21
+; SI-NEXT:    v_writelane_b32 v63, s31, 21
 ; SI-NEXT:    v_readfirstlane_b32 s45, v17
 ; SI-NEXT:    v_readfirstlane_b32 s44, v16
 ; SI-NEXT:    v_readfirstlane_b32 s43, v15
@@ -136180,6 +136180,7 @@ define inreg <64 x i16> @bitcast_v16f64_to_v64i16_scalar(<16 x double> inreg %a,
 ; SI-NEXT:    v_and_b32_e32 v27, 0xffff, v27
 ; SI-NEXT:    v_and_b32_e32 v29, 0xffff, v29
 ; SI-NEXT:    v_and_b32_e32 v31, 0xffff, v31
+; SI-NEXT:    v_readlane_b32 s30, v63, 20
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v55
 ; SI-NEXT:    v_or_b32_e32 v17, v17, v39
 ; SI-NEXT:    v_or_b32_e32 v19, v19, v38
@@ -136187,28 +136188,27 @@ define inreg <64 x i16> @bitcast_v16f64_to_v64i16_scalar(<16 x double> inreg %a,
 ; SI-NEXT:    v_or_b32_e32 v23, v23, v36
 ; SI-NEXT:    v_or_b32_e32 v25, v25, v35
 ; SI-NEXT:    v_or_b32_e32 v31, v31, v32
-; SI-NEXT:    v_readlane_b32 s69, v63, 21
-; SI-NEXT:    v_readlane_b32 s68, v63, 20
-; SI-NEXT:    v_readlane_b32 s67, v63, 19
-; SI-NEXT:    v_readlane_b32 s66, v63, 18
-; SI-NEXT:    v_readlane_b32 s65, v63, 17
-; SI-NEXT:    v_readlane_b32 s64, v63, 16
-; SI-NEXT:    v_readlane_b32 s55, v63, 15
-; SI-NEXT:    v_readlane_b32 s54, v63, 14
-; SI-NEXT:    v_readlane_b32 s53, v63, 13
-; SI-NEXT:    v_readlane_b32 s52, v63, 12
-; SI-NEXT:    v_readlane_b32 s51, v63, 11
-; SI-NEXT:    v_readlane_b32 s50, v63, 10
-; SI-NEXT:    v_readlane_b32 s49, v63, 9
-; SI-NEXT:    v_readlane_b32 s48, v63, 8
-; SI-NEXT:    v_readlane_b32 s39, v63, 7
-; SI-NEXT:    v_readlane_b32 s38, v63, 6
-; SI-NEXT:    v_readlane_b32 s37, v63, 5
-; SI-NEXT:    v_readlane_b32 s36, v63, 4
-; SI-NEXT:    v_readlane_b32 s35, v63, 3
-; SI-NEXT:    v_readlane_b32 s34, v63, 2
-; SI-NEXT:    v_readlane_b32 s31, v63, 1
-; SI-NEXT:    v_readlane_b32 s30, v63, 0
+; SI-NEXT:    v_readlane_b32 s31, v63, 21
+; SI-NEXT:    v_readlane_b32 s69, v63, 19
+; SI-NEXT:    v_readlane_b32 s68, v63, 18
+; SI-NEXT:    v_readlane_b32 s67, v63, 17
+; SI-NEXT:    v_readlane_b32 s66, v63, 16
+; SI-NEXT:    v_readlane_b32 s65, v63, 15
+; SI-NEXT:    v_readlane_b32 s64, v63, 14
+; SI-NEXT:    v_readlane_b32 s55, v63, 13
+; SI-NEXT:    v_readlane_b32 s54, v63, 12
+; SI-NEXT:    v_readlane_b32 s53, v63, 11
+; SI-NEXT:    v_readlane_b32 s52, v63, 10
+; SI-NEXT:    v_readlane_b32 s51, v63, 9
+; SI-NEXT:    v_readlane_b32 s50, v63, 8
+; SI-NEXT:    v_readlane_b32 s49, v63, 7
+; SI-NEXT:    v_readlane_b32 s48, v63, 6
+; SI-NEXT:    v_readlane_b32 s39, v63, 5
+; SI-NEXT:    v_readlane_b32 s38, v63, 4
+; SI-NEXT:    v_readlane_b32 s37, v63, 3
+; SI-NEXT:    v_readlane_b32 s36, v63, 2
+; SI-NEXT:    v_readlane_b32 s35, v63, 1
+; SI-NEXT:    v_readlane_b32 s34, v63, 0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v34, 16, v34
 ; SI-NEXT:    v_or_b32_e32 v27, v27, v34
@@ -136227,17 +136227,17 @@ define inreg <64 x i16> @bitcast_v16f64_to_v64i16_scalar(<16 x double> inreg %a,
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v32, s30, 0
-; VI-NEXT:    v_writelane_b32 v32, s31, 1
-; VI-NEXT:    v_writelane_b32 v32, s36, 2
-; VI-NEXT:    v_writelane_b32 v32, s37, 3
-; VI-NEXT:    v_writelane_b32 v32, s38, 4
-; VI-NEXT:    v_writelane_b32 v32, s39, 5
-; VI-NEXT:    v_writelane_b32 v32, s48, 6
-; VI-NEXT:    v_writelane_b32 v32, s49, 7
-; VI-NEXT:    v_writelane_b32 v32, s50, 8
+; VI-NEXT:    v_writelane_b32 v32, s36, 0
+; VI-NEXT:    v_writelane_b32 v32, s37, 1
+; VI-NEXT:    v_writelane_b32 v32, s38, 2
+; VI-NEXT:    v_writelane_b32 v32, s39, 3
+; VI-NEXT:    v_writelane_b32 v32, s48, 4
+; VI-NEXT:    v_writelane_b32 v32, s49, 5
+; VI-NEXT:    v_writelane_b32 v32, s50, 6
+; VI-NEXT:    v_writelane_b32 v32, s51, 7
+; VI-NEXT:    v_writelane_b32 v32, s30, 8
 ; VI-NEXT:    v_readfirstlane_b32 s4, v18
-; VI-NEXT:    v_writelane_b32 v32, s51, 9
+; VI-NEXT:    v_writelane_b32 v32, s31, 9
 ; VI-NEXT:    v_readfirstlane_b32 s51, v17
 ; VI-NEXT:    v_readfirstlane_b32 s50, v16
 ; VI-NEXT:    v_readfirstlane_b32 s49, v15
@@ -136314,16 +136314,16 @@ define inreg <64 x i16> @bitcast_v16f64_to_v64i16_scalar(<16 x double> inreg %a,
 ; VI-NEXT:    v_mov_b32_e32 v14, s30
 ; VI-NEXT:    v_mov_b32_e32 v15, s31
 ; VI-NEXT:  .LBB85_5: ; %end
-; VI-NEXT:    v_readlane_b32 s51, v32, 9
-; VI-NEXT:    v_readlane_b32 s50, v32, 8
-; VI-NEXT:    v_readlane_b32 s49, v32, 7
-; VI-NEXT:    v_readlane_b32 s48, v32, 6
-; VI-NEXT:    v_readlane_b32 s39, v32, 5
-; VI-NEXT:    v_readlane_b32 s38, v32, 4
-; VI-NEXT:    v_readlane_b32 s37, v32, 3
-; VI-NEXT:    v_readlane_b32 s36, v32, 2
-; VI-NEXT:    v_readlane_b32 s31, v32, 1
-; VI-NEXT:    v_readlane_b32 s30, v32, 0
+; VI-NEXT:    v_readlane_b32 s30, v32, 8
+; VI-NEXT:    v_readlane_b32 s31, v32, 9
+; VI-NEXT:    v_readlane_b32 s51, v32, 7
+; VI-NEXT:    v_readlane_b32 s50, v32, 6
+; VI-NEXT:    v_readlane_b32 s49, v32, 5
+; VI-NEXT:    v_readlane_b32 s48, v32, 4
+; VI-NEXT:    v_readlane_b32 s39, v32, 3
+; VI-NEXT:    v_readlane_b32 s38, v32, 2
+; VI-NEXT:    v_readlane_b32 s37, v32, 1
+; VI-NEXT:    v_readlane_b32 s36, v32, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -136336,17 +136336,17 @@ define inreg <64 x i16> @bitcast_v16f64_to_v64i16_scalar(<16 x double> inreg %a,
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v32, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v32, s31, 1
-; GFX9-NEXT:    v_writelane_b32 v32, s36, 2
-; GFX9-NEXT:    v_writelane_b32 v32, s37, 3
-; GFX9-NEXT:    v_writelane_b32 v32, s38, 4
-; GFX9-NEXT:    v_writelane_b32 v32, s39, 5
-; GFX9-NEXT:    v_writelane_b32 v32, s48, 6
-; GFX9-NEXT:    v_writelane_b32 v32, s49, 7
-; GFX9-NEXT:    v_writelane_b32 v32, s50, 8
+; GFX9-NEXT:    v_writelane_b32 v32, s36, 0
+; GFX9-NEXT:    v_writelane_b32 v32, s37, 1
+; GFX9-NEXT:    v_writelane_b32 v32, s38, 2
+; GFX9-NEXT:    v_writelane_b32 v32, s39, 3
+; GFX9-NEXT:    v_writelane_b32 v32, s48, 4
+; GFX9-NEXT:    v_writelane_b32 v32, s49, 5
+; GFX9-NEXT:    v_writelane_b32 v32, s50, 6
+; GFX9-NEXT:    v_writelane_b32 v32, s51, 7
+; GFX9-NEXT:    v_writelane_b32 v32, s30, 8
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v18
-; GFX9-NEXT:    v_writelane_b32 v32, s51, 9
+; GFX9-NEXT:    v_writelane_b32 v32, s31, 9
 ; GFX9-NEXT:    v_readfirstlane_b32 s51, v17
 ; GFX9-NEXT:    v_readfirstlane_b32 s50, v16
 ; GFX9-NEXT:    v_readfirstlane_b32 s49, v15
@@ -136423,16 +136423,16 @@ define inreg <64 x i16> @bitcast_v16f64_to_v64i16_scalar(<16 x double> inreg %a,
 ; GFX9-NEXT:    v_mov_b32_e32 v14, s30
 ; GFX9-NEXT:    v_mov_b32_e32 v15, s31
 ; GFX9-NEXT:  .LBB85_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s51, v32, 9
-; GFX9-NEXT:    v_readlane_b32 s50, v32, 8
-; GFX9-NEXT:    v_readlane_b32 s49, v32, 7
-; GFX9-NEXT:    v_readlane_b32 s48, v32, 6
-; GFX9-NEXT:    v_readlane_b32 s39, v32, 5
-; GFX9-NEXT:    v_readlane_b32 s38, v32, 4
-; GFX9-NEXT:    v_readlane_b32 s37, v32, 3
-; GFX9-NEXT:    v_readlane_b32 s36, v32, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v32, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v32, 0
+; GFX9-NEXT:    v_readlane_b32 s30, v32, 8
+; GFX9-NEXT:    v_readlane_b32 s31, v32, 9
+; GFX9-NEXT:    v_readlane_b32 s51, v32, 7
+; GFX9-NEXT:    v_readlane_b32 s50, v32, 6
+; GFX9-NEXT:    v_readlane_b32 s49, v32, 5
+; GFX9-NEXT:    v_readlane_b32 s48, v32, 4
+; GFX9-NEXT:    v_readlane_b32 s39, v32, 3
+; GFX9-NEXT:    v_readlane_b32 s38, v32, 2
+; GFX9-NEXT:    v_readlane_b32 s37, v32, 1
+; GFX9-NEXT:    v_readlane_b32 s36, v32, 0
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -137346,45 +137346,46 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a,
 ; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_writelane_b32 v32, s30, 0
-; SI-NEXT:    v_writelane_b32 v32, s31, 1
-; SI-NEXT:    v_writelane_b32 v32, s34, 2
-; SI-NEXT:    v_writelane_b32 v32, s35, 3
-; SI-NEXT:    v_writelane_b32 v32, s36, 4
-; SI-NEXT:    v_writelane_b32 v32, s37, 5
-; SI-NEXT:    v_writelane_b32 v32, s38, 6
-; SI-NEXT:    v_writelane_b32 v32, s39, 7
-; SI-NEXT:    v_writelane_b32 v32, s48, 8
-; SI-NEXT:    v_writelane_b32 v32, s49, 9
-; SI-NEXT:    v_writelane_b32 v32, s50, 10
-; SI-NEXT:    v_writelane_b32 v32, s51, 11
-; SI-NEXT:    v_writelane_b32 v32, s52, 12
-; SI-NEXT:    v_writelane_b32 v32, s53, 13
-; SI-NEXT:    v_writelane_b32 v32, s54, 14
-; SI-NEXT:    v_writelane_b32 v32, s55, 15
-; SI-NEXT:    v_writelane_b32 v32, s64, 16
-; SI-NEXT:    v_writelane_b32 v32, s65, 17
-; SI-NEXT:    v_writelane_b32 v32, s66, 18
-; SI-NEXT:    v_writelane_b32 v32, s67, 19
-; SI-NEXT:    v_writelane_b32 v32, s68, 20
-; SI-NEXT:    v_writelane_b32 v32, s69, 21
-; SI-NEXT:    v_writelane_b32 v32, s70, 22
-; SI-NEXT:    v_writelane_b32 v32, s71, 23
-; SI-NEXT:    v_writelane_b32 v32, s80, 24
-; SI-NEXT:    v_writelane_b32 v32, s81, 25
-; SI-NEXT:    v_writelane_b32 v32, s82, 26
-; SI-NEXT:    v_writelane_b32 v32, s83, 27
-; SI-NEXT:    v_writelane_b32 v32, s84, 28
-; SI-NEXT:    v_writelane_b32 v32, s85, 29
-; SI-NEXT:    v_writelane_b32 v32, s86, 30
-; SI-NEXT:    v_writelane_b32 v32, s87, 31
-; SI-NEXT:    v_writelane_b32 v32, s96, 32
+; SI-NEXT:    v_writelane_b32 v32, s34, 0
+; SI-NEXT:    v_writelane_b32 v32, s35, 1
+; SI-NEXT:    v_writelane_b32 v32, s36, 2
+; SI-NEXT:    v_writelane_b32 v32, s37, 3
+; SI-NEXT:    v_writelane_b32 v32, s38, 4
+; SI-NEXT:    v_writelane_b32 v32, s39, 5
+; SI-NEXT:    v_writelane_b32 v32, s48, 6
+; SI-NEXT:    v_writelane_b32 v32, s49, 7
+; SI-NEXT:    v_writelane_b32 v32, s50, 8
+; SI-NEXT:    v_writelane_b32 v32, s51, 9
+; SI-NEXT:    v_writelane_b32 v32, s52, 10
+; SI-NEXT:    v_writelane_b32 v32, s53, 11
+; SI-NEXT:    v_writelane_b32 v32, s54, 12
+; SI-NEXT:    v_writelane_b32 v32, s55, 13
+; SI-NEXT:    v_writelane_b32 v32, s64, 14
+; SI-NEXT:    v_writelane_b32 v32, s65, 15
+; SI-NEXT:    v_writelane_b32 v32, s66, 16
+; SI-NEXT:    v_writelane_b32 v32, s67, 17
+; SI-NEXT:    v_writelane_b32 v32, s68, 18
+; SI-NEXT:    v_writelane_b32 v32, s69, 19
+; SI-NEXT:    v_writelane_b32 v32, s70, 20
+; SI-NEXT:    v_writelane_b32 v32, s71, 21
+; SI-NEXT:    v_writelane_b32 v32, s80, 22
+; SI-NEXT:    v_writelane_b32 v32, s81, 23
+; SI-NEXT:    v_writelane_b32 v32, s82, 24
+; SI-NEXT:    v_writelane_b32 v32, s83, 25
+; SI-NEXT:    v_writelane_b32 v32, s84, 26
+; SI-NEXT:    v_writelane_b32 v32, s85, 27
+; SI-NEXT:    v_writelane_b32 v32, s86, 28
+; SI-NEXT:    v_writelane_b32 v32, s87, 29
+; SI-NEXT:    v_writelane_b32 v32, s96, 30
+; SI-NEXT:    v_writelane_b32 v32, s97, 31
+; SI-NEXT:    v_writelane_b32 v32, s98, 32
+; SI-NEXT:    v_writelane_b32 v32, s99, 33
 ; SI-NEXT:    v_readfirstlane_b32 s9, v16
-; SI-NEXT:    v_writelane_b32 v32, s97, 33
+; SI-NEXT:    v_writelane_b32 v32, s30, 34
 ; SI-NEXT:    s_lshr_b32 s14, s9, 16
 ; SI-NEXT:    v_readfirstlane_b32 s13, v14
 ; SI-NEXT:    ; implicit-def: $vgpr33 : SGPR spill to VGPR lane
-; SI-NEXT:    v_writelane_b32 v32, s98, 34
+; SI-NEXT:    v_writelane_b32 v32, s31, 35
 ; SI-NEXT:    v_readfirstlane_b32 s7, v17
 ; SI-NEXT:    v_readfirstlane_b32 s11, v15
 ; SI-NEXT:    s_lshr_b32 s72, s13, 16
@@ -137404,7 +137405,6 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a,
 ; SI-NEXT:    v_readfirstlane_b32 s97, v0
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_writelane_b32 v33, s14, 0
-; SI-NEXT:    v_writelane_b32 v32, s99, 35
 ; SI-NEXT:    s_lshr_b32 s92, s29, 16
 ; SI-NEXT:    s_lshr_b32 s95, s28, 16
 ; SI-NEXT:    s_lshr_b32 s34, s27, 16
@@ -137709,6 +137709,7 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a,
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_add_i32 s67, s4, 0x30000
 ; SI-NEXT:  .LBB87_3: ; %end
+; SI-NEXT:    v_readlane_b32 s30, v32, 34
 ; SI-NEXT:    v_mov_b32_e32 v0, s36
 ; SI-NEXT:    v_mov_b32_e32 v1, s37
 ; SI-NEXT:    v_mov_b32_e32 v2, s38
@@ -137741,42 +137742,41 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a,
 ; SI-NEXT:    v_mov_b32_e32 v29, s65
 ; SI-NEXT:    v_mov_b32_e32 v30, s66
 ; SI-NEXT:    v_mov_b32_e32 v31, s67
-; SI-NEXT:    v_readlane_b32 s99, v32, 35
-; SI-NEXT:    v_readlane_b32 s98, v32, 34
-; SI-NEXT:    v_readlane_b32 s97, v32, 33
-; SI-NEXT:    v_readlane_b32 s96, v32, 32
-; SI-NEXT:    v_readlane_b32 s87, v32, 31
-; SI-NEXT:    v_readlane_b32 s86, v32, 30
-; SI-NEXT:    v_readlane_b32 s85, v32, 29
-; SI-NEXT:    v_readlane_b32 s84, v32, 28
-; SI-NEXT:    v_readlane_b32 s83, v32, 27
-; SI-NEXT:    v_readlane_b32 s82, v32, 26
-; SI-NEXT:    v_readlane_b32 s81, v32, 25
-; SI-NEXT:    v_readlane_b32 s80, v32, 24
-; SI-NEXT:    v_readlane_b32 s71, v32, 23
-; SI-NEXT:    v_readlane_b32 s70, v32, 22
-; SI-NEXT:    v_readlane_b32 s69, v32, 21
-; SI-NEXT:    v_readlane_b32 s68, v32, 20
-; SI-NEXT:    v_readlane_b32 s67, v32, 19
-; SI-NEXT:    v_readlane_b32 s66, v32, 18
-; SI-NEXT:    v_readlane_b32 s65, v32, 17
-; SI-NEXT:    v_readlane_b32 s64, v32, 16
-; SI-NEXT:    v_readlane_b32 s55, v32, 15
-; SI-NEXT:    v_readlane_b32 s54, v32, 14
-; SI-NEXT:    v_readlane_b32 s53, v32, 13
-; SI-NEXT:    v_readlane_b32 s52, v32, 12
-; SI-NEXT:    v_readlane_b32 s51, v32, 11
-; SI-NEXT:    v_readlane_b32 s50, v32, 10
-; SI-NEXT:    v_readlane_b32 s49, v32, 9
-; SI-NEXT:    v_readlane_b32 s48, v32, 8
-; SI-NEXT:    v_readlane_b32 s39, v32, 7
-; SI-NEXT:    v_readlane_b32 s38, v32, 6
-; SI-NEXT:    v_readlane_b32 s37, v32, 5
-; SI-NEXT:    v_readlane_b32 s36, v32, 4
-; SI-NEXT:    v_readlane_b32 s35, v32, 3
-; SI-NEXT:    v_readlane_b32 s34, v32, 2
-; SI-NEXT:    v_readlane_b32 s31, v32, 1
-; SI-NEXT:    v_readlane_b32 s30, v32, 0
+; SI-NEXT:    v_readlane_b32 s31, v32, 35
+; SI-NEXT:    v_readlane_b32 s99, v32, 33
+; SI-NEXT:    v_readlane_b32 s98, v32, 32
+; SI-NEXT:    v_readlane_b32 s97, v32, 31
+; SI-NEXT:    v_readlane_b32 s96, v32, 30
+; SI-NEXT:    v_readlane_b32 s87, v32, 29
+; SI-NEXT:    v_readlane_b32 s86, v32, 28
+; SI-NEXT:    v_readlane_b32 s85, v32, 27
+; SI-NEXT:    v_readlane_b32 s84, v32, 26
+; SI-NEXT:    v_readlane_b32 s83, v32, 25
+; SI-NEXT:    v_readlane_b32 s82, v32, 24
+; SI-NEXT:    v_readlane_b32 s81, v32, 23
+; SI-NEXT:    v_readlane_b32 s80, v32, 22
+; SI-NEXT:    v_readlane_b32 s71, v32, 21
+; SI-NEXT:    v_readlane_b32 s70, v32, 20
+; SI-NEXT:    v_readlane_b32 s69, v32, 19
+; SI-NEXT:    v_readlane_b32 s68, v32, 18
+; SI-NEXT:    v_readlane_b32 s67, v32, 17
+; SI-NEXT:    v_readlane_b32 s66, v32, 16
+; SI-NEXT:    v_readlane_b32 s65, v32, 15
+; SI-NEXT:    v_readlane_b32 s64, v32, 14
+; SI-NEXT:    v_readlane_b32 s55, v32, 13
+; SI-NEXT:    v_readlane_b32 s54, v32, 12
+; SI-NEXT:    v_readlane_b32 s53, v32, 11
+; SI-NEXT:    v_readlane_b32 s52, v32, 10
+; SI-NEXT:    v_readlane_b32 s51, v32, 9
+; SI-NEXT:    v_readlane_b32 s50, v32, 8
+; SI-NEXT:    v_readlane_b32 s49, v32, 7
+; SI-NEXT:    v_readlane_b32 s48, v32, 6
+; SI-NEXT:    v_readlane_b32 s39, v32, 5
+; SI-NEXT:    v_readlane_b32 s38, v32, 4
+; SI-NEXT:    v_readlane_b32 s37, v32, 3
+; SI-NEXT:    v_readlane_b32 s36, v32, 2
+; SI-NEXT:    v_readlane_b32 s35, v32, 1
+; SI-NEXT:    v_readlane_b32 s34, v32, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -144589,31 +144589,29 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; SI-NEXT:    v_readfirstlane_b32 s4, v29
 ; SI-NEXT:    v_writelane_b32 v43, s4, 9
 ; SI-NEXT:    v_readfirstlane_b32 s4, v28
-; SI-NEXT:    v_writelane_b32 v40, s30, 0
 ; SI-NEXT:    v_writelane_b32 v43, s4, 10
 ; SI-NEXT:    v_readfirstlane_b32 s4, v27
-; SI-NEXT:    v_writelane_b32 v40, s31, 1
 ; SI-NEXT:    v_writelane_b32 v43, s4, 11
 ; SI-NEXT:    v_readfirstlane_b32 s4, v26
-; SI-NEXT:    v_writelane_b32 v40, s34, 2
+; SI-NEXT:    v_writelane_b32 v40, s34, 0
 ; SI-NEXT:    v_writelane_b32 v43, s4, 12
 ; SI-NEXT:    v_readfirstlane_b32 s4, v25
-; SI-NEXT:    v_writelane_b32 v40, s35, 3
+; SI-NEXT:    v_writelane_b32 v40, s35, 1
 ; SI-NEXT:    v_writelane_b32 v43, s4, 13
 ; SI-NEXT:    v_readfirstlane_b32 s4, v24
-; SI-NEXT:    v_writelane_b32 v40, s36, 4
+; SI-NEXT:    v_writelane_b32 v40, s36, 2
 ; SI-NEXT:    v_writelane_b32 v43, s4, 14
 ; SI-NEXT:    v_readfirstlane_b32 s4, v23
-; SI-NEXT:    v_writelane_b32 v40, s37, 5
+; SI-NEXT:    v_writelane_b32 v40, s37, 3
 ; SI-NEXT:    v_writelane_b32 v43, s4, 15
 ; SI-NEXT:    v_readfirstlane_b32 s4, v22
-; SI-NEXT:    v_writelane_b32 v40, s38, 6
+; SI-NEXT:    v_writelane_b32 v40, s38, 4
 ; SI-NEXT:    v_writelane_b32 v43, s4, 16
 ; SI-NEXT:    v_readfirstlane_b32 s4, v21
-; SI-NEXT:    v_writelane_b32 v40, s39, 7
+; SI-NEXT:    v_writelane_b32 v40, s39, 5
 ; SI-NEXT:    v_writelane_b32 v43, s4, 17
 ; SI-NEXT:    v_readfirstlane_b32 s4, v20
-; SI-NEXT:    v_writelane_b32 v40, s48, 8
+; SI-NEXT:    v_writelane_b32 v40, s48, 6
 ; SI-NEXT:    v_writelane_b32 v43, s4, 18
 ; SI-NEXT:    v_readfirstlane_b32 s4, v19
 ; SI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:328
@@ -144641,10 +144639,10 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:332
 ; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:240
 ; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:236
-; SI-NEXT:    v_writelane_b32 v40, s49, 9
-; SI-NEXT:    v_writelane_b32 v40, s50, 10
-; SI-NEXT:    v_writelane_b32 v40, s51, 11
-; SI-NEXT:    v_writelane_b32 v40, s52, 12
+; SI-NEXT:    v_writelane_b32 v40, s49, 7
+; SI-NEXT:    v_writelane_b32 v40, s50, 8
+; SI-NEXT:    v_writelane_b32 v40, s51, 9
+; SI-NEXT:    v_writelane_b32 v40, s52, 10
 ; SI-NEXT:    s_mov_b32 s37, s18
 ; SI-NEXT:    v_readfirstlane_b32 s18, v18
 ; SI-NEXT:    v_readfirstlane_b32 s38, v17
@@ -144654,10 +144652,10 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:228
 ; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:224
 ; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:220
-; SI-NEXT:    v_writelane_b32 v40, s53, 13
-; SI-NEXT:    v_writelane_b32 v40, s54, 14
-; SI-NEXT:    v_writelane_b32 v40, s55, 15
-; SI-NEXT:    v_writelane_b32 v40, s64, 16
+; SI-NEXT:    v_writelane_b32 v40, s53, 11
+; SI-NEXT:    v_writelane_b32 v40, s54, 12
+; SI-NEXT:    v_writelane_b32 v40, s55, 13
+; SI-NEXT:    v_writelane_b32 v40, s64, 14
 ; SI-NEXT:    s_mov_b32 s73, s19
 ; SI-NEXT:    v_readfirstlane_b32 s55, v14
 ; SI-NEXT:    v_readfirstlane_b32 s34, v13
@@ -144681,32 +144679,32 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:184
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:180
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:176
+; SI-NEXT:    v_writelane_b32 v40, s65, 15
+; SI-NEXT:    v_writelane_b32 v40, s66, 16
 ; SI-NEXT:    v_writelane_b32 v43, s4, 19
+; SI-NEXT:    v_writelane_b32 v40, s67, 17
+; SI-NEXT:    v_writelane_b32 v40, s68, 18
+; SI-NEXT:    v_writelane_b32 v40, s69, 19
+; SI-NEXT:    v_writelane_b32 v40, s70, 20
+; SI-NEXT:    v_writelane_b32 v40, s71, 21
+; SI-NEXT:    v_writelane_b32 v40, s80, 22
+; SI-NEXT:    v_writelane_b32 v40, s81, 23
+; SI-NEXT:    v_writelane_b32 v40, s82, 24
+; SI-NEXT:    v_writelane_b32 v40, s83, 25
+; SI-NEXT:    v_writelane_b32 v40, s84, 26
+; SI-NEXT:    v_writelane_b32 v40, s85, 27
+; SI-NEXT:    v_writelane_b32 v40, s86, 28
+; SI-NEXT:    v_writelane_b32 v40, s87, 29
+; SI-NEXT:    v_writelane_b32 v40, s96, 30
+; SI-NEXT:    v_writelane_b32 v40, s97, 31
+; SI-NEXT:    v_writelane_b32 v40, s98, 32
+; SI-NEXT:    v_writelane_b32 v40, s99, 33
+; SI-NEXT:    v_writelane_b32 v40, s30, 34
+; SI-NEXT:    v_writelane_b32 v40, s31, 35
 ; SI-NEXT:    v_readfirstlane_b32 s53, v6
 ; SI-NEXT:    v_readfirstlane_b32 s76, v5
 ; SI-NEXT:    v_readfirstlane_b32 s77, v4
 ; SI-NEXT:    v_readfirstlane_b32 s48, v3
-; SI-NEXT:    v_writelane_b32 v40, s65, 17
-; SI-NEXT:    v_writelane_b32 v40, s66, 18
-; SI-NEXT:    v_writelane_b32 v40, s67, 19
-; SI-NEXT:    v_writelane_b32 v40, s68, 20
-; SI-NEXT:    ; implicit-def: $vgpr42 : SGPR spill to VGPR lane
-; SI-NEXT:    v_writelane_b32 v40, s69, 21
-; SI-NEXT:    v_writelane_b32 v40, s70, 22
-; SI-NEXT:    v_writelane_b32 v40, s71, 23
-; SI-NEXT:    v_writelane_b32 v40, s80, 24
-; SI-NEXT:    v_writelane_b32 v40, s81, 25
-; SI-NEXT:    v_writelane_b32 v40, s82, 26
-; SI-NEXT:    v_writelane_b32 v40, s83, 27
-; SI-NEXT:    v_writelane_b32 v40, s84, 28
-; SI-NEXT:    v_writelane_b32 v40, s85, 29
-; SI-NEXT:    v_writelane_b32 v40, s86, 30
-; SI-NEXT:    v_writelane_b32 v40, s87, 31
-; SI-NEXT:    v_writelane_b32 v40, s96, 32
-; SI-NEXT:    v_writelane_b32 v40, s97, 33
-; SI-NEXT:    v_writelane_b32 v40, s98, 34
-; SI-NEXT:    v_writelane_b32 v40, s99, 35
-; SI-NEXT:    s_mov_b32 s6, s20
 ; SI-NEXT:    s_waitcnt vmcnt(14)
 ; SI-NEXT:    v_readfirstlane_b32 s91, v19
 ; SI-NEXT:    v_readfirstlane_b32 s43, v20
@@ -144750,6 +144748,8 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:120
 ; SI-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:116
 ; SI-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:112
+; SI-NEXT:    ; implicit-def: $vgpr42 : SGPR spill to VGPR lane
+; SI-NEXT:    s_mov_b32 s6, s20
 ; SI-NEXT:    v_readfirstlane_b32 s4, v18
 ; SI-NEXT:    v_writelane_b32 v43, s4, 21
 ; SI-NEXT:    s_waitcnt vmcnt(14)
@@ -146006,43 +146006,43 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; SI-NEXT:    v_mul_f32_e64 v31, 1.0, s83
 ; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v31
 ; SI-NEXT:    v_mul_f32_e64 v31, 1.0, s84
+; SI-NEXT:    v_readlane_b32 s30, v40, 34
 ; SI-NEXT:    v_lshr_b64 v[31:32], v[31:32], 16
-; SI-NEXT:    v_readlane_b32 s99, v40, 35
-; SI-NEXT:    v_readlane_b32 s98, v40, 34
-; SI-NEXT:    v_readlane_b32 s97, v40, 33
-; SI-NEXT:    v_readlane_b32 s96, v40, 32
-; SI-NEXT:    v_readlane_b32 s87, v40, 31
-; SI-NEXT:    v_readlane_b32 s86, v40, 30
-; SI-NEXT:    v_readlane_b32 s85, v40, 29
-; SI-NEXT:    v_readlane_b32 s84, v40, 28
-; SI-NEXT:    v_readlane_b32 s83, v40, 27
-; SI-NEXT:    v_readlane_b32 s82, v40, 26
-; SI-NEXT:    v_readlane_b32 s81, v40, 25
-; SI-NEXT:    v_readlane_b32 s80, v40, 24
-; SI-NEXT:    v_readlane_b32 s71, v40, 23
-; SI-NEXT:    v_readlane_b32 s70, v40, 22
-; SI-NEXT:    v_readlane_b32 s69, v40, 21
-; SI-NEXT:    v_readlane_b32 s68, v40, 20
-; SI-NEXT:    v_readlane_b32 s67, v40, 19
-; SI-NEXT:    v_readlane_b32 s66, v40, 18
-; SI-NEXT:    v_readlane_b32 s65, v40, 17
-; SI-NEXT:    v_readlane_b32 s64, v40, 16
-; SI-NEXT:    v_readlane_b32 s55, v40, 15
-; SI-NEXT:    v_readlane_b32 s54, v40, 14
-; SI-NEXT:    v_readlane_b32 s53, v40, 13
-; SI-NEXT:    v_readlane_b32 s52, v40, 12
-; SI-NEXT:    v_readlane_b32 s51, v40, 11
-; SI-NEXT:    v_readlane_b32 s50, v40, 10
-; SI-NEXT:    v_readlane_b32 s49, v40, 9
-; SI-NEXT:    v_readlane_b32 s48, v40, 8
-; SI-NEXT:    v_readlane_b32 s39, v40, 7
-; SI-NEXT:    v_readlane_b32 s38, v40, 6
-; SI-NEXT:    v_readlane_b32 s37, v40, 5
-; SI-NEXT:    v_readlane_b32 s36, v40, 4
-; SI-NEXT:    v_readlane_b32 s35, v40, 3
-; SI-NEXT:    v_readlane_b32 s34, v40, 2
-; SI-NEXT:    v_readlane_b32 s31, v40, 1
-; SI-NEXT:    v_readlane_b32 s30, v40, 0
+; SI-NEXT:    v_readlane_b32 s31, v40, 35
+; SI-NEXT:    v_readlane_b32 s99, v40, 33
+; SI-NEXT:    v_readlane_b32 s98, v40, 32
+; SI-NEXT:    v_readlane_b32 s97, v40, 31
+; SI-NEXT:    v_readlane_b32 s96, v40, 30
+; SI-NEXT:    v_readlane_b32 s87, v40, 29
+; SI-NEXT:    v_readlane_b32 s86, v40, 28
+; SI-NEXT:    v_readlane_b32 s85, v40, 27
+; SI-NEXT:    v_readlane_b32 s84, v40, 26
+; SI-NEXT:    v_readlane_b32 s83, v40, 25
+; SI-NEXT:    v_readlane_b32 s82, v40, 24
+; SI-NEXT:    v_readlane_b32 s81, v40, 23
+; SI-NEXT:    v_readlane_b32 s80, v40, 22
+; SI-NEXT:    v_readlane_b32 s71, v40, 21
+; SI-NEXT:    v_readlane_b32 s70, v40, 20
+; SI-NEXT:    v_readlane_b32 s69, v40, 19
+; SI-NEXT:    v_readlane_b32 s68, v40, 18
+; SI-NEXT:    v_readlane_b32 s67, v40, 17
+; SI-NEXT:    v_readlane_b32 s66, v40, 16
+; SI-NEXT:    v_readlane_b32 s65, v40, 15
+; SI-NEXT:    v_readlane_b32 s64, v40, 14
+; SI-NEXT:    v_readlane_b32 s55, v40, 13
+; SI-NEXT:    v_readlane_b32 s54, v40, 12
+; SI-NEXT:    v_readlane_b32 s53, v40, 11
+; SI-NEXT:    v_readlane_b32 s52, v40, 10
+; SI-NEXT:    v_readlane_b32 s51, v40, 9
+; SI-NEXT:    v_readlane_b32 s50, v40, 8
+; SI-NEXT:    v_readlane_b32 s49, v40, 7
+; SI-NEXT:    v_readlane_b32 s48, v40, 6
+; SI-NEXT:    v_readlane_b32 s39, v40, 5
+; SI-NEXT:    v_readlane_b32 s38, v40, 4
+; SI-NEXT:    v_readlane_b32 s37, v40, 3
+; SI-NEXT:    v_readlane_b32 s36, v40, 2
+; SI-NEXT:    v_readlane_b32 s35, v40, 1
+; SI-NEXT:    v_readlane_b32 s34, v40, 0
 ; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
@@ -156263,46 +156263,49 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(2)
-; SI-NEXT:    v_writelane_b32 v63, s30, 0
-; SI-NEXT:    v_writelane_b32 v63, s31, 1
-; SI-NEXT:    v_writelane_b32 v63, s34, 2
-; SI-NEXT:    v_writelane_b32 v63, s35, 3
-; SI-NEXT:    v_writelane_b32 v63, s36, 4
-; SI-NEXT:    v_writelane_b32 v63, s37, 5
-; SI-NEXT:    v_writelane_b32 v63, s38, 6
-; SI-NEXT:    v_writelane_b32 v63, s39, 7
-; SI-NEXT:    v_writelane_b32 v63, s48, 8
-; SI-NEXT:    v_writelane_b32 v63, s49, 9
-; SI-NEXT:    v_writelane_b32 v63, s50, 10
-; SI-NEXT:    v_writelane_b32 v63, s51, 11
-; SI-NEXT:    v_writelane_b32 v63, s52, 12
-; SI-NEXT:    v_writelane_b32 v63, s53, 13
-; SI-NEXT:    v_writelane_b32 v63, s54, 14
-; SI-NEXT:    v_writelane_b32 v63, s55, 15
-; SI-NEXT:    v_writelane_b32 v63, s64, 16
-; SI-NEXT:    v_writelane_b32 v63, s65, 17
-; SI-NEXT:    v_writelane_b32 v63, s66, 18
-; SI-NEXT:    v_writelane_b32 v63, s67, 19
-; SI-NEXT:    v_writelane_b32 v63, s68, 20
-; SI-NEXT:    v_writelane_b32 v63, s69, 21
-; SI-NEXT:    v_writelane_b32 v63, s70, 22
-; SI-NEXT:    v_writelane_b32 v63, s71, 23
-; SI-NEXT:    v_writelane_b32 v63, s80, 24
-; SI-NEXT:    v_writelane_b32 v63, s81, 25
-; SI-NEXT:    v_writelane_b32 v63, s82, 26
-; SI-NEXT:    v_writelane_b32 v63, s83, 27
-; SI-NEXT:    v_writelane_b32 v63, s84, 28
-; SI-NEXT:    v_writelane_b32 v63, s85, 29
+; SI-NEXT:    v_writelane_b32 v63, s34, 0
+; SI-NEXT:    v_writelane_b32 v63, s35, 1
+; SI-NEXT:    v_writelane_b32 v63, s36, 2
+; SI-NEXT:    v_writelane_b32 v63, s37, 3
+; SI-NEXT:    v_writelane_b32 v63, s38, 4
+; SI-NEXT:    v_writelane_b32 v63, s39, 5
+; SI-NEXT:    v_writelane_b32 v63, s48, 6
+; SI-NEXT:    v_writelane_b32 v63, s49, 7
+; SI-NEXT:    v_writelane_b32 v63, s50, 8
+; SI-NEXT:    v_writelane_b32 v63, s51, 9
+; SI-NEXT:    v_writelane_b32 v63, s52, 10
+; SI-NEXT:    v_writelane_b32 v63, s53, 11
+; SI-NEXT:    v_writelane_b32 v63, s54, 12
+; SI-NEXT:    v_writelane_b32 v63, s55, 13
+; SI-NEXT:    v_writelane_b32 v63, s64, 14
+; SI-NEXT:    v_writelane_b32 v63, s65, 15
+; SI-NEXT:    v_writelane_b32 v63, s66, 16
+; SI-NEXT:    v_writelane_b32 v63, s67, 17
+; SI-NEXT:    v_writelane_b32 v63, s68, 18
+; SI-NEXT:    v_writelane_b32 v63, s69, 19
+; SI-NEXT:    v_writelane_b32 v63, s70, 20
+; SI-NEXT:    v_writelane_b32 v63, s71, 21
+; SI-NEXT:    v_writelane_b32 v63, s80, 22
+; SI-NEXT:    v_writelane_b32 v63, s81, 23
+; SI-NEXT:    v_writelane_b32 v63, s82, 24
+; SI-NEXT:    v_writelane_b32 v63, s83, 25
+; SI-NEXT:    v_writelane_b32 v63, s84, 26
+; SI-NEXT:    v_writelane_b32 v63, s85, 27
+; SI-NEXT:    v_writelane_b32 v63, s86, 28
+; SI-NEXT:    v_writelane_b32 v63, s87, 29
+; SI-NEXT:    v_writelane_b32 v63, s96, 30
+; SI-NEXT:    v_writelane_b32 v63, s97, 31
+; SI-NEXT:    v_writelane_b32 v63, s98, 32
 ; SI-NEXT:    v_readfirstlane_b32 s56, v15
-; SI-NEXT:    v_writelane_b32 v63, s86, 30
+; SI-NEXT:    v_writelane_b32 v63, s99, 33
 ; SI-NEXT:    s_and_b32 s57, s56, 0xffff0000
-; SI-NEXT:    v_writelane_b32 v63, s87, 31
+; SI-NEXT:    v_writelane_b32 v63, s30, 34
 ; SI-NEXT:    v_readfirstlane_b32 s46, v16
 ; SI-NEXT:    s_lshl_b32 s56, s56, 16
 ; SI-NEXT:    v_readfirstlane_b32 s78, v8
 ; SI-NEXT:    v_readfirstlane_b32 vcc_lo, v3
 ; SI-NEXT:    v_mul_f32_e64 v8, 1.0, s57
-; SI-NEXT:    v_writelane_b32 v63, s96, 32
+; SI-NEXT:    v_writelane_b32 v63, s31, 35
 ; SI-NEXT:    v_readfirstlane_b32 s44, v17
 ; SI-NEXT:    s_and_b32 s47, s46, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s46, s46, 16
@@ -156312,7 +156315,6 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mul_f32_e64 v8, 1.0, s56
-; SI-NEXT:    v_writelane_b32 v63, s97, 33
 ; SI-NEXT:    v_readfirstlane_b32 s42, v18
 ; SI-NEXT:    s_and_b32 s45, s44, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s44, s44, 16
@@ -156332,7 +156334,6 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mul_f32_e64 v8, 1.0, s46
-; SI-NEXT:    v_writelane_b32 v63, s98, 34
 ; SI-NEXT:    s_and_b32 s4, s29, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s5, s29, 16
 ; SI-NEXT:    s_and_b32 s6, s28, 0xffff0000
@@ -156404,7 +156405,6 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill
-; SI-NEXT:    v_writelane_b32 v63, s99, 35
 ; SI-NEXT:    s_cmp_lg_u32 vcc_lo, 0
 ; SI-NEXT:    v_mul_f32_e64 v10, 1.0, s41
 ; SI-NEXT:    v_mul_f32_e64 v15, 1.0, s16
@@ -157942,40 +157942,40 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x44, v0
 ; SI-NEXT:    v_readlane_b32 s27, v62, 59
 ; SI-NEXT:    s_and_b32 s4, s4, 0xff
+; SI-NEXT:    v_readlane_b32 s30, v63, 34
 ; SI-NEXT:    v_readlane_b32 s61, v62, 35
 ; SI-NEXT:    v_readlane_b32 s43, v62, 47
-; SI-NEXT:    v_readlane_b32 s99, v63, 35
-; SI-NEXT:    v_readlane_b32 s98, v63, 34
-; SI-NEXT:    v_readlane_b32 s97, v63, 33
-; SI-NEXT:    v_readlane_b32 s96, v63, 32
-; SI-NEXT:    v_readlane_b32 s87, v63, 31
-; SI-NEXT:    v_readlane_b32 s86, v63, 30
-; SI-NEXT:    v_readlane_b32 s85, v63, 29
-; SI-NEXT:    v_readlane_b32 s84, v63, 28
-; SI-NEXT:    v_readlane_b32 s83, v63, 27
-; SI-NEXT:    v_readlane_b32 s82, v63, 26
-; SI-NEXT:    v_readlane_b32 s81, v63, 25
-; SI-NEXT:    v_readlane_b32 s80, v63, 24
-; SI-NEXT:    v_readlane_b32 s71, v63, 23
-; SI-NEXT:    v_readlane_b32 s70, v63, 22
-; SI-NEXT:    v_readlane_b32 s69, v63, 21
-; SI-NEXT:    v_readlane_b32 s67, v63, 19
-; SI-NEXT:    v_readlane_b32 s66, v63, 18
-; SI-NEXT:    v_readlane_b32 s65, v63, 17
-; SI-NEXT:    v_readlane_b32 s64, v63, 16
-; SI-NEXT:    v_readlane_b32 s55, v63, 15
-; SI-NEXT:    v_readlane_b32 s53, v63, 13
-; SI-NEXT:    v_readlane_b32 s52, v63, 12
-; SI-NEXT:    v_readlane_b32 s51, v63, 11
-; SI-NEXT:    v_readlane_b32 s50, v63, 10
-; SI-NEXT:    v_readlane_b32 s49, v63, 9
-; SI-NEXT:    v_readlane_b32 s48, v63, 8
-; SI-NEXT:    v_readlane_b32 s39, v63, 7
-; SI-NEXT:    v_readlane_b32 s38, v63, 6
-; SI-NEXT:    v_readlane_b32 s37, v63, 5
-; SI-NEXT:    v_readlane_b32 s35, v63, 3
-; SI-NEXT:    v_readlane_b32 s31, v63, 1
-; SI-NEXT:    v_readlane_b32 s30, v63, 0
+; SI-NEXT:    v_readlane_b32 s31, v63, 35
+; SI-NEXT:    v_readlane_b32 s99, v63, 33
+; SI-NEXT:    v_readlane_b32 s98, v63, 32
+; SI-NEXT:    v_readlane_b32 s97, v63, 31
+; SI-NEXT:    v_readlane_b32 s96, v63, 30
+; SI-NEXT:    v_readlane_b32 s87, v63, 29
+; SI-NEXT:    v_readlane_b32 s86, v63, 28
+; SI-NEXT:    v_readlane_b32 s85, v63, 27
+; SI-NEXT:    v_readlane_b32 s84, v63, 26
+; SI-NEXT:    v_readlane_b32 s83, v63, 25
+; SI-NEXT:    v_readlane_b32 s82, v63, 24
+; SI-NEXT:    v_readlane_b32 s81, v63, 23
+; SI-NEXT:    v_readlane_b32 s80, v63, 22
+; SI-NEXT:    v_readlane_b32 s71, v63, 21
+; SI-NEXT:    v_readlane_b32 s70, v63, 20
+; SI-NEXT:    v_readlane_b32 s69, v63, 19
+; SI-NEXT:    v_readlane_b32 s67, v63, 17
+; SI-NEXT:    v_readlane_b32 s66, v63, 16
+; SI-NEXT:    v_readlane_b32 s65, v63, 15
+; SI-NEXT:    v_readlane_b32 s64, v63, 14
+; SI-NEXT:    v_readlane_b32 s55, v63, 13
+; SI-NEXT:    v_readlane_b32 s53, v63, 11
+; SI-NEXT:    v_readlane_b32 s52, v63, 10
+; SI-NEXT:    v_readlane_b32 s51, v63, 9
+; SI-NEXT:    v_readlane_b32 s50, v63, 8
+; SI-NEXT:    v_readlane_b32 s49, v63, 7
+; SI-NEXT:    v_readlane_b32 s48, v63, 6
+; SI-NEXT:    v_readlane_b32 s39, v63, 5
+; SI-NEXT:    v_readlane_b32 s38, v63, 4
+; SI-NEXT:    v_readlane_b32 s37, v63, 3
+; SI-NEXT:    v_readlane_b32 s35, v63, 1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
 ; SI-NEXT:    v_or_b32_e32 v1, s8, v1
@@ -158193,10 +158193,10 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:    v_readlane_b32 s17, v61, 19
 ; SI-NEXT:    v_readlane_b32 s11, v61, 25
 ; SI-NEXT:    v_readlane_b32 s9, v61, 33
-; SI-NEXT:    v_readlane_b32 s68, v63, 20
-; SI-NEXT:    v_readlane_b32 s54, v63, 14
-; SI-NEXT:    v_readlane_b32 s36, v63, 4
-; SI-NEXT:    v_readlane_b32 s34, v63, 2
+; SI-NEXT:    v_readlane_b32 s68, v63, 18
+; SI-NEXT:    v_readlane_b32 s54, v63, 12
+; SI-NEXT:    v_readlane_b32 s36, v63, 2
+; SI-NEXT:    v_readlane_b32 s34, v63, 0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
 ; SI-NEXT:    v_or_b32_e32 v1, s5, v1
@@ -158230,39 +158230,39 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; VI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v63, s30, 0
-; VI-NEXT:    v_writelane_b32 v63, s31, 1
-; VI-NEXT:    v_writelane_b32 v63, s34, 2
-; VI-NEXT:    v_writelane_b32 v63, s35, 3
-; VI-NEXT:    v_writelane_b32 v63, s36, 4
-; VI-NEXT:    v_writelane_b32 v63, s37, 5
-; VI-NEXT:    v_writelane_b32 v63, s38, 6
-; VI-NEXT:    v_writelane_b32 v63, s39, 7
-; VI-NEXT:    v_writelane_b32 v63, s48, 8
-; VI-NEXT:    v_writelane_b32 v63, s49, 9
-; VI-NEXT:    v_writelane_b32 v63, s50, 10
-; VI-NEXT:    v_writelane_b32 v63, s51, 11
-; VI-NEXT:    v_writelane_b32 v63, s52, 12
-; VI-NEXT:    v_writelane_b32 v63, s53, 13
-; VI-NEXT:    v_writelane_b32 v63, s54, 14
-; VI-NEXT:    v_writelane_b32 v63, s55, 15
-; VI-NEXT:    v_writelane_b32 v63, s64, 16
-; VI-NEXT:    v_writelane_b32 v63, s65, 17
-; VI-NEXT:    v_writelane_b32 v63, s66, 18
-; VI-NEXT:    v_writelane_b32 v63, s67, 19
-; VI-NEXT:    v_writelane_b32 v63, s68, 20
-; VI-NEXT:    v_writelane_b32 v63, s69, 21
-; VI-NEXT:    v_writelane_b32 v63, s70, 22
-; VI-NEXT:    v_writelane_b32 v63, s71, 23
-; VI-NEXT:    v_writelane_b32 v63, s80, 24
-; VI-NEXT:    v_writelane_b32 v63, s81, 25
-; VI-NEXT:    v_writelane_b32 v63, s82, 26
-; VI-NEXT:    v_writelane_b32 v63, s83, 27
-; VI-NEXT:    v_writelane_b32 v63, s84, 28
-; VI-NEXT:    v_writelane_b32 v63, s85, 29
-; VI-NEXT:    v_writelane_b32 v63, s86, 30
+; VI-NEXT:    v_writelane_b32 v63, s34, 0
+; VI-NEXT:    v_writelane_b32 v63, s35, 1
+; VI-NEXT:    v_writelane_b32 v63, s36, 2
+; VI-NEXT:    v_writelane_b32 v63, s37, 3
+; VI-NEXT:    v_writelane_b32 v63, s38, 4
+; VI-NEXT:    v_writelane_b32 v63, s39, 5
+; VI-NEXT:    v_writelane_b32 v63, s48, 6
+; VI-NEXT:    v_writelane_b32 v63, s49, 7
+; VI-NEXT:    v_writelane_b32 v63, s50, 8
+; VI-NEXT:    v_writelane_b32 v63, s51, 9
+; VI-NEXT:    v_writelane_b32 v63, s52, 10
+; VI-NEXT:    v_writelane_b32 v63, s53, 11
+; VI-NEXT:    v_writelane_b32 v63, s54, 12
+; VI-NEXT:    v_writelane_b32 v63, s55, 13
+; VI-NEXT:    v_writelane_b32 v63, s64, 14
+; VI-NEXT:    v_writelane_b32 v63, s65, 15
+; VI-NEXT:    v_writelane_b32 v63, s66, 16
+; VI-NEXT:    v_writelane_b32 v63, s67, 17
+; VI-NEXT:    v_writelane_b32 v63, s68, 18
+; VI-NEXT:    v_writelane_b32 v63, s69, 19
+; VI-NEXT:    v_writelane_b32 v63, s70, 20
+; VI-NEXT:    v_writelane_b32 v63, s71, 21
+; VI-NEXT:    v_writelane_b32 v63, s80, 22
+; VI-NEXT:    v_writelane_b32 v63, s81, 23
+; VI-NEXT:    v_writelane_b32 v63, s82, 24
+; VI-NEXT:    v_writelane_b32 v63, s83, 25
+; VI-NEXT:    v_writelane_b32 v63, s84, 26
+; VI-NEXT:    v_writelane_b32 v63, s85, 27
+; VI-NEXT:    v_writelane_b32 v63, s86, 28
+; VI-NEXT:    v_writelane_b32 v63, s87, 29
+; VI-NEXT:    v_writelane_b32 v63, s30, 30
 ; VI-NEXT:    v_readfirstlane_b32 s4, v19
-; VI-NEXT:    v_writelane_b32 v63, s87, 31
+; VI-NEXT:    v_writelane_b32 v63, s31, 31
 ; VI-NEXT:    v_readfirstlane_b32 s7, v18
 ; VI-NEXT:    v_readfirstlane_b32 s6, v17
 ; VI-NEXT:    v_readfirstlane_b32 s9, v16
@@ -159733,38 +159733,38 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; VI-NEXT:    buffer_store_dword v5, v6, s[0:3], 0 offen
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
 ; VI-NEXT:    v_perm_b32 v5, v32, v34, s4
-; VI-NEXT:    v_readlane_b32 s87, v63, 31
-; VI-NEXT:    v_readlane_b32 s86, v63, 30
-; VI-NEXT:    v_readlane_b32 s85, v63, 29
-; VI-NEXT:    v_readlane_b32 s84, v63, 28
-; VI-NEXT:    v_readlane_b32 s83, v63, 27
-; VI-NEXT:    v_readlane_b32 s82, v63, 26
-; VI-NEXT:    v_readlane_b32 s81, v63, 25
-; VI-NEXT:    v_readlane_b32 s80, v63, 24
-; VI-NEXT:    v_readlane_b32 s71, v63, 23
-; VI-NEXT:    v_readlane_b32 s70, v63, 22
-; VI-NEXT:    v_readlane_b32 s69, v63, 21
-; VI-NEXT:    v_readlane_b32 s68, v63, 20
-; VI-NEXT:    v_readlane_b32 s67, v63, 19
-; VI-NEXT:    v_readlane_b32 s66, v63, 18
-; VI-NEXT:    v_readlane_b32 s65, v63, 17
-; VI-NEXT:    v_readlane_b32 s64, v63, 16
-; VI-NEXT:    v_readlane_b32 s55, v63, 15
-; VI-NEXT:    v_readlane_b32 s54, v63, 14
-; VI-NEXT:    v_readlane_b32 s53, v63, 13
-; VI-NEXT:    v_readlane_b32 s52, v63, 12
-; VI-NEXT:    v_readlane_b32 s51, v63, 11
-; VI-NEXT:    v_readlane_b32 s50, v63, 10
-; VI-NEXT:    v_readlane_b32 s49, v63, 9
-; VI-NEXT:    v_readlane_b32 s48, v63, 8
-; VI-NEXT:    v_readlane_b32 s39, v63, 7
-; VI-NEXT:    v_readlane_b32 s38, v63, 6
-; VI-NEXT:    v_readlane_b32 s37, v63, 5
-; VI-NEXT:    v_readlane_b32 s36, v63, 4
-; VI-NEXT:    v_readlane_b32 s35, v63, 3
-; VI-NEXT:    v_readlane_b32 s34, v63, 2
-; VI-NEXT:    v_readlane_b32 s31, v63, 1
-; VI-NEXT:    v_readlane_b32 s30, v63, 0
+; VI-NEXT:    v_readlane_b32 s30, v63, 30
+; VI-NEXT:    v_readlane_b32 s31, v63, 31
+; VI-NEXT:    v_readlane_b32 s87, v63, 29
+; VI-NEXT:    v_readlane_b32 s86, v63, 28
+; VI-NEXT:    v_readlane_b32 s85, v63, 27
+; VI-NEXT:    v_readlane_b32 s84, v63, 26
+; VI-NEXT:    v_readlane_b32 s83, v63, 25
+; VI-NEXT:    v_readlane_b32 s82, v63, 24
+; VI-NEXT:    v_readlane_b32 s81, v63, 23
+; VI-NEXT:    v_readlane_b32 s80, v63, 22
+; VI-NEXT:    v_readlane_b32 s71, v63, 21
+; VI-NEXT:    v_readlane_b32 s70, v63, 20
+; VI-NEXT:    v_readlane_b32 s69, v63, 19
+; VI-NEXT:    v_readlane_b32 s68, v63, 18
+; VI-NEXT:    v_readlane_b32 s67, v63, 17
+; VI-NEXT:    v_readlane_b32 s66, v63, 16
+; VI-NEXT:    v_readlane_b32 s65, v63, 15
+; VI-NEXT:    v_readlane_b32 s64, v63, 14
+; VI-NEXT:    v_readlane_b32 s55, v63, 13
+; VI-NEXT:    v_readlane_b32 s54, v63, 12
+; VI-NEXT:    v_readlane_b32 s53, v63, 11
+; VI-NEXT:    v_readlane_b32 s52, v63, 10
+; VI-NEXT:    v_readlane_b32 s51, v63, 9
+; VI-NEXT:    v_readlane_b32 s50, v63, 8
+; VI-NEXT:    v_readlane_b32 s49, v63, 7
+; VI-NEXT:    v_readlane_b32 s48, v63, 6
+; VI-NEXT:    v_readlane_b32 s39, v63, 5
+; VI-NEXT:    v_readlane_b32 s38, v63, 4
+; VI-NEXT:    v_readlane_b32 s37, v63, 3
+; VI-NEXT:    v_readlane_b32 s36, v63, 2
+; VI-NEXT:    v_readlane_b32 s35, v63, 1
+; VI-NEXT:    v_readlane_b32 s34, v63, 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_perm_b32 v4, v2, v4, s4
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
@@ -160088,43 +160088,43 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX9-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v63, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v63, s31, 1
-; GFX9-NEXT:    v_writelane_b32 v63, s34, 2
-; GFX9-NEXT:    v_writelane_b32 v63, s35, 3
-; GFX9-NEXT:    v_writelane_b32 v63, s36, 4
-; GFX9-NEXT:    v_writelane_b32 v63, s37, 5
-; GFX9-NEXT:    v_writelane_b32 v63, s38, 6
-; GFX9-NEXT:    v_writelane_b32 v63, s39, 7
-; GFX9-NEXT:    v_writelane_b32 v63, s48, 8
-; GFX9-NEXT:    v_writelane_b32 v63, s49, 9
-; GFX9-NEXT:    v_writelane_b32 v63, s50, 10
-; GFX9-NEXT:    v_writelane_b32 v63, s51, 11
-; GFX9-NEXT:    v_writelane_b32 v63, s52, 12
-; GFX9-NEXT:    v_writelane_b32 v63, s53, 13
-; GFX9-NEXT:    v_writelane_b32 v63, s54, 14
-; GFX9-NEXT:    v_writelane_b32 v63, s55, 15
-; GFX9-NEXT:    v_writelane_b32 v63, s64, 16
-; GFX9-NEXT:    v_writelane_b32 v63, s65, 17
-; GFX9-NEXT:    v_writelane_b32 v63, s66, 18
-; GFX9-NEXT:    v_writelane_b32 v63, s67, 19
-; GFX9-NEXT:    v_writelane_b32 v63, s68, 20
-; GFX9-NEXT:    v_writelane_b32 v63, s69, 21
-; GFX9-NEXT:    v_writelane_b32 v63, s70, 22
-; GFX9-NEXT:    v_writelane_b32 v63, s71, 23
-; GFX9-NEXT:    v_writelane_b32 v63, s80, 24
-; GFX9-NEXT:    v_writelane_b32 v63, s81, 25
-; GFX9-NEXT:    v_writelane_b32 v63, s82, 26
-; GFX9-NEXT:    v_writelane_b32 v63, s83, 27
-; GFX9-NEXT:    v_writelane_b32 v63, s84, 28
-; GFX9-NEXT:    v_writelane_b32 v63, s85, 29
-; GFX9-NEXT:    v_writelane_b32 v63, s86, 30
-; GFX9-NEXT:    v_writelane_b32 v63, s87, 31
-; GFX9-NEXT:    v_writelane_b32 v63, s96, 32
-; GFX9-NEXT:    v_writelane_b32 v63, s97, 33
-; GFX9-NEXT:    v_writelane_b32 v63, s98, 34
+; GFX9-NEXT:    v_writelane_b32 v63, s34, 0
+; GFX9-NEXT:    v_writelane_b32 v63, s35, 1
+; GFX9-NEXT:    v_writelane_b32 v63, s36, 2
+; GFX9-NEXT:    v_writelane_b32 v63, s37, 3
+; GFX9-NEXT:    v_writelane_b32 v63, s38, 4
+; GFX9-NEXT:    v_writelane_b32 v63, s39, 5
+; GFX9-NEXT:    v_writelane_b32 v63, s48, 6
+; GFX9-NEXT:    v_writelane_b32 v63, s49, 7
+; GFX9-NEXT:    v_writelane_b32 v63, s50, 8
+; GFX9-NEXT:    v_writelane_b32 v63, s51, 9
+; GFX9-NEXT:    v_writelane_b32 v63, s52, 10
+; GFX9-NEXT:    v_writelane_b32 v63, s53, 11
+; GFX9-NEXT:    v_writelane_b32 v63, s54, 12
+; GFX9-NEXT:    v_writelane_b32 v63, s55, 13
+; GFX9-NEXT:    v_writelane_b32 v63, s64, 14
+; GFX9-NEXT:    v_writelane_b32 v63, s65, 15
+; GFX9-NEXT:    v_writelane_b32 v63, s66, 16
+; GFX9-NEXT:    v_writelane_b32 v63, s67, 17
+; GFX9-NEXT:    v_writelane_b32 v63, s68, 18
+; GFX9-NEXT:    v_writelane_b32 v63, s69, 19
+; GFX9-NEXT:    v_writelane_b32 v63, s70, 20
+; GFX9-NEXT:    v_writelane_b32 v63, s71, 21
+; GFX9-NEXT:    v_writelane_b32 v63, s80, 22
+; GFX9-NEXT:    v_writelane_b32 v63, s81, 23
+; GFX9-NEXT:    v_writelane_b32 v63, s82, 24
+; GFX9-NEXT:    v_writelane_b32 v63, s83, 25
+; GFX9-NEXT:    v_writelane_b32 v63, s84, 26
+; GFX9-NEXT:    v_writelane_b32 v63, s85, 27
+; GFX9-NEXT:    v_writelane_b32 v63, s86, 28
+; GFX9-NEXT:    v_writelane_b32 v63, s87, 29
+; GFX9-NEXT:    v_writelane_b32 v63, s96, 30
+; GFX9-NEXT:    v_writelane_b32 v63, s97, 31
+; GFX9-NEXT:    v_writelane_b32 v63, s98, 32
+; GFX9-NEXT:    v_writelane_b32 v63, s99, 33
+; GFX9-NEXT:    v_writelane_b32 v63, s30, 34
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v19
-; GFX9-NEXT:    v_writelane_b32 v63, s99, 35
+; GFX9-NEXT:    v_writelane_b32 v63, s31, 35
 ; GFX9-NEXT:    v_readfirstlane_b32 s7, v18
 ; GFX9-NEXT:    v_readfirstlane_b32 s6, v17
 ; GFX9-NEXT:    v_readfirstlane_b32 s9, v16
@@ -161532,42 +161532,42 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX9-NEXT:    v_perm_b32 v34, v56, v34, s4
 ; GFX9-NEXT:    v_perm_b32 v28, v29, v28, s4
 ; GFX9-NEXT:    v_perm_b32 v22, v22, v32, s4
-; GFX9-NEXT:    v_readlane_b32 s99, v63, 35
-; GFX9-NEXT:    v_readlane_b32 s98, v63, 34
-; GFX9-NEXT:    v_readlane_b32 s97, v63, 33
-; GFX9-NEXT:    v_readlane_b32 s96, v63, 32
-; GFX9-NEXT:    v_readlane_b32 s87, v63, 31
-; GFX9-NEXT:    v_readlane_b32 s86, v63, 30
-; GFX9-NEXT:    v_readlane_b32 s85, v63, 29
-; GFX9-NEXT:    v_readlane_b32 s84, v63, 28
-; GFX9-NEXT:    v_readlane_b32 s83, v63, 27
-; GFX9-NEXT:    v_readlane_b32 s82, v63, 26
-; GFX9-NEXT:    v_readlane_b32 s81, v63, 25
-; GFX9-NEXT:    v_readlane_b32 s80, v63, 24
-; GFX9-NEXT:    v_readlane_b32 s71, v63, 23
-; GFX9-NEXT:    v_readlane_b32 s70, v63, 22
-; GFX9-NEXT:    v_readlane_b32 s69, v63, 21
-; GFX9-NEXT:    v_readlane_b32 s68, v63, 20
-; GFX9-NEXT:    v_readlane_b32 s67, v63, 19
-; GFX9-NEXT:    v_readlane_b32 s66, v63, 18
-; GFX9-NEXT:    v_readlane_b32 s65, v63, 17
-; GFX9-NEXT:    v_readlane_b32 s64, v63, 16
-; GFX9-NEXT:    v_readlane_b32 s55, v63, 15
-; GFX9-NEXT:    v_readlane_b32 s54, v63, 14
-; GFX9-NEXT:    v_readlane_b32 s53, v63, 13
-; GFX9-NEXT:    v_readlane_b32 s52, v63, 12
-; GFX9-NEXT:    v_readlane_b32 s51, v63, 11
-; GFX9-NEXT:    v_readlane_b32 s50, v63, 10
-; GFX9-NEXT:    v_readlane_b32 s49, v63, 9
-; GFX9-NEXT:    v_readlane_b32 s48, v63, 8
-; GFX9-NEXT:    v_readlane_b32 s39, v63, 7
-; GFX9-NEXT:    v_readlane_b32 s38, v63, 6
-; GFX9-NEXT:    v_readlane_b32 s37, v63, 5
-; GFX9-NEXT:    v_readlane_b32 s36, v63, 4
-; GFX9-NEXT:    v_readlane_b32 s35, v63, 3
-; GFX9-NEXT:    v_readlane_b32 s34, v63, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v63, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v63, 0
+; GFX9-NEXT:    v_readlane_b32 s30, v63, 34
+; GFX9-NEXT:    v_readlane_b32 s31, v63, 35
+; GFX9-NEXT:    v_readlane_b32 s99, v63, 33
+; GFX9-NEXT:    v_readlane_b32 s98, v63, 32
+; GFX9-NEXT:    v_readlane_b32 s97, v63, 31
+; GFX9-NEXT:    v_readlane_b32 s96, v63, 30
+; GFX9-NEXT:    v_readlane_b32 s87, v63, 29
+; GFX9-NEXT:    v_readlane_b32 s86, v63, 28
+; GFX9-NEXT:    v_readlane_b32 s85, v63, 27
+; GFX9-NEXT:    v_readlane_b32 s84, v63, 26
+; GFX9-NEXT:    v_readlane_b32 s83, v63, 25
+; GFX9-NEXT:    v_readlane_b32 s82, v63, 24
+; GFX9-NEXT:    v_readlane_b32 s81, v63, 23
+; GFX9-NEXT:    v_readlane_b32 s80, v63, 22
+; GFX9-NEXT:    v_readlane_b32 s71, v63, 21
+; GFX9-NEXT:    v_readlane_b32 s70, v63, 20
+; GFX9-NEXT:    v_readlane_b32 s69, v63, 19
+; GFX9-NEXT:    v_readlane_b32 s68, v63, 18
+; GFX9-NEXT:    v_readlane_b32 s67, v63, 17
+; GFX9-NEXT:    v_readlane_b32 s66, v63, 16
+; GFX9-NEXT:    v_readlane_b32 s65, v63, 15
+; GFX9-NEXT:    v_readlane_b32 s64, v63, 14
+; GFX9-NEXT:    v_readlane_b32 s55, v63, 13
+; GFX9-NEXT:    v_readlane_b32 s54, v63, 12
+; GFX9-NEXT:    v_readlane_b32 s53, v63, 11
+; GFX9-NEXT:    v_readlane_b32 s52, v63, 10
+; GFX9-NEXT:    v_readlane_b32 s51, v63, 9
+; GFX9-NEXT:    v_readlane_b32 s50, v63, 8
+; GFX9-NEXT:    v_readlane_b32 s49, v63, 7
+; GFX9-NEXT:    v_readlane_b32 s48, v63, 6
+; GFX9-NEXT:    v_readlane_b32 s39, v63, 5
+; GFX9-NEXT:    v_readlane_b32 s38, v63, 4
+; GFX9-NEXT:    v_readlane_b32 s37, v63, 3
+; GFX9-NEXT:    v_readlane_b32 s36, v63, 2
+; GFX9-NEXT:    v_readlane_b32 s35, v63, 1
+; GFX9-NEXT:    v_readlane_b32 s34, v63, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_perm_b32 v25, v25, v30, s4
 ; GFX9-NEXT:    v_or_b32_e32 v25, v25, v26
@@ -161886,33 +161886,33 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v76, s32 offset:80
 ; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v77, s32 offset:84
 ; GFX11-TRUE16-NEXT:    s_mov_b32 exec_lo, s4
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s30, 0
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v75, s96, 0
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s34, 0
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v75, s98, 0
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s42, v15
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s5, v14
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s4, v13
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s31, 1
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v75, s97, 1
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s35, 1
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v75, s99, 1
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s7, v12
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s6, v11
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s9, v10
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s34, 2
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v75, s98, 2
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s36, 2
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v75, s100, 2
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s8, v9
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s11, v8
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s10, v7
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s35, 3
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v75, s99, 3
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s37, 3
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v75, s101, 3
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s13, v6
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s12, v5
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s15, v4
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s36, 4
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v75, s100, 4
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s38, 4
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v75, s102, 4
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s14, v3
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s41, v2
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s40, v1
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s37, 5
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v75, s101, 5
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s39, 5
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v75, s103, 5
 ; GFX11-TRUE16-NEXT:    s_cmp_lg_u32 s42, 0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 vcc_lo, 0
 ; GFX11-TRUE16-NEXT:    s_clause 0x11 ; 72-byte Folded Spill
@@ -161934,37 +161934,37 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v63, s32 offset:8
 ; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v72, s32 offset:4
 ; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v73, s32
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s38, 6
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v75, s102, 6
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s48, 6
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v75, s104, 6
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr76 : SGPR spill to VGPR lane
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr77 : SGPR spill to VGPR lane
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s39, 7
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v75, s103, 7
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s48, 8
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v75, s104, 8
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s49, 9
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s50, 10
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s51, 11
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s52, 12
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s53, 13
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s54, 14
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s55, 15
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s64, 16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s65, 17
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s66, 18
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s67, 19
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s68, 20
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s69, 21
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s70, 22
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s71, 23
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s80, 24
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s81, 25
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s82, 26
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s83, 27
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s84, 28
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s85, 29
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s86, 30
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s87, 31
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s49, 7
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v75, s30, 7
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s50, 8
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v75, s31, 8
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s51, 9
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s52, 10
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s53, 11
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s54, 12
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s55, 13
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s64, 14
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s65, 15
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s66, 16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s67, 17
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s68, 18
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s69, 19
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s70, 20
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s71, 21
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s80, 22
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s81, 23
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s82, 24
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s83, 25
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s84, 26
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s85, 27
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s86, 28
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s87, 29
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s96, 30
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s97, 31
 ; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB91_3
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s42, s27, 24
@@ -163283,47 +163283,47 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX11-TRUE16-NEXT:    scratch_load_b32 v42, off, s32 offset:60
 ; GFX11-TRUE16-NEXT:    scratch_load_b32 v41, off, s32 offset:64
 ; GFX11-TRUE16-NEXT:    scratch_load_b32 v40, off, s32 offset:68
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s104, v75, 8
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s103, v75, 7
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s102, v75, 6
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s101, v75, 5
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s100, v75, 4
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s99, v75, 3
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s98, v75, 2
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s97, v75, 1
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s96, v75, 0
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s87, v74, 31
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s86, v74, 30
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s85, v74, 29
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s84, v74, 28
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s83, v74, 27
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s82, v74, 26
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s81, v74, 25
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s80, v74, 24
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s71, v74, 23
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s70, v74, 22
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s69, v74, 21
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s68, v74, 20
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s67, v74, 19
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s66, v74, 18
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s65, v74, 17
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s64, v74, 16
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s55, v74, 15
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s54, v74, 14
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s53, v74, 13
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s52, v74, 12
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s51, v74, 11
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s50, v74, 10
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s49, v74, 9
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s48, v74, 8
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s39, v74, 7
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s38, v74, 6
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s37, v74, 5
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s36, v74, 4
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s35, v74, 3
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s34, v74, 2
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v74, 1
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v74, 0
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v75, 7
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v75, 8
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s104, v75, 6
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s103, v75, 5
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s102, v75, 4
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s101, v75, 3
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s100, v75, 2
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s99, v75, 1
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s98, v75, 0
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s97, v74, 31
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s96, v74, 30
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s87, v74, 29
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s86, v74, 28
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s85, v74, 27
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s84, v74, 26
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s83, v74, 25
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s82, v74, 24
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s81, v74, 23
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s80, v74, 22
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s71, v74, 21
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s70, v74, 20
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s69, v74, 19
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s68, v74, 18
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s67, v74, 17
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s66, v74, 16
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s65, v74, 15
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s64, v74, 14
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s55, v74, 13
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s54, v74, 12
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s53, v74, 11
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s52, v74, 10
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s51, v74, 9
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s50, v74, 8
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s49, v74, 7
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s48, v74, 6
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s39, v74, 5
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s38, v74, 4
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s37, v74, 3
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s36, v74, 2
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s35, v74, 1
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s34, v74, 0
 ; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX11-TRUE16-NEXT:    s_clause 0x3 ; 16-byte Folded Reload
 ; GFX11-TRUE16-NEXT:    scratch_load_b32 v74, off, s32 offset:72
@@ -163344,33 +163344,33 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v76, s32 offset:80
 ; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v77, s32 offset:84
 ; GFX11-FAKE16-NEXT:    s_mov_b32 exec_lo, s4
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s30, 0
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v75, s96, 0
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s34, 0
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v75, s98, 0
 ; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s42, v15
 ; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s5, v14
 ; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s4, v13
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s31, 1
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v75, s97, 1
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s35, 1
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v75, s99, 1
 ; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s7, v12
 ; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s6, v11
 ; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s9, v10
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s34, 2
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v75, s98, 2
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s36, 2
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v75, s100, 2
 ; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s8, v9
 ; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s11, v8
 ; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s10, v7
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s35, 3
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v75, s99, 3
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s37, 3
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v75, s101, 3
 ; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s13, v6
 ; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s12, v5
 ; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s15, v4
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s36, 4
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v75, s100, 4
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s38, 4
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v75, s102, 4
 ; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s14, v3
 ; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s41, v2
 ; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s40, v1
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s37, 5
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v75, s101, 5
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s39, 5
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v75, s103, 5
 ; GFX11-FAKE16-NEXT:    s_cmp_lg_u32 s42, 0
 ; GFX11-FAKE16-NEXT:    s_mov_b32 vcc_lo, 0
 ; GFX11-FAKE16-NEXT:    s_clause 0x11 ; 72-byte Folded Spill
@@ -163392,37 +163392,37 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v63, s32 offset:8
 ; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v72, s32 offset:4
 ; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v73, s32
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s38, 6
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v75, s102, 6
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s48, 6
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v75, s104, 6
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr76 : SGPR spill to VGPR lane
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr77 : SGPR spill to VGPR lane
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s39, 7
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v75, s103, 7
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s48, 8
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v75, s104, 8
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s49, 9
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s50, 10
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s51, 11
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s52, 12
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s53, 13
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s54, 14
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s55, 15
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s64, 16
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s65, 17
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s66, 18
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s67, 19
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s68, 20
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s69, 21
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s70, 22
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s71, 23
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s80, 24
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s81, 25
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s82, 26
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s83, 27
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s84, 28
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s85, 29
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s86, 30
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s87, 31
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s49, 7
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v75, s30, 7
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s50, 8
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v75, s31, 8
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s51, 9
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s52, 10
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s53, 11
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s54, 12
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s55, 13
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s64, 14
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s65, 15
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s66, 16
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s67, 17
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s68, 18
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s69, 19
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s70, 20
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s71, 21
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s80, 22
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s81, 23
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s82, 24
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s83, 25
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s84, 26
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s85, 27
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s86, 28
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s87, 29
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s96, 30
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s97, 31
 ; GFX11-FAKE16-NEXT:    s_cbranch_scc0 .LBB91_3
 ; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s27, 24
@@ -164746,47 +164746,47 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX11-FAKE16-NEXT:    scratch_load_b32 v42, off, s32 offset:60
 ; GFX11-FAKE16-NEXT:    scratch_load_b32 v41, off, s32 offset:64
 ; GFX11-FAKE16-NEXT:    scratch_load_b32 v40, off, s32 offset:68
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s104, v75, 8
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s103, v75, 7
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s102, v75, 6
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s101, v75, 5
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s100, v75, 4
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s99, v75, 3
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s98, v75, 2
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s97, v75, 1
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s96, v75, 0
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s87, v74, 31
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s86, v74, 30
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s85, v74, 29
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s84, v74, 28
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s83, v74, 27
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s82, v74, 26
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s81, v74, 25
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s80, v74, 24
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s71, v74, 23
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s70, v74, 22
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s69, v74, 21
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s68, v74, 20
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s67, v74, 19
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s66, v74, 18
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s65, v74, 17
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s64, v74, 16
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s55, v74, 15
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s54, v74, 14
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s53, v74, 13
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s52, v74, 12
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s51, v74, 11
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s50, v74, 10
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s49, v74, 9
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s48, v74, 8
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s39, v74, 7
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s38, v74, 6
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s37, v74, 5
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s36, v74, 4
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s35, v74, 3
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s34, v74, 2
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v74, 1
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v74, 0
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v75, 7
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v75, 8
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s104, v75, 6
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s103, v75, 5
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s102, v75, 4
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s101, v75, 3
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s100, v75, 2
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s99, v75, 1
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s98, v75, 0
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s97, v74, 31
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s96, v74, 30
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s87, v74, 29
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s86, v74, 28
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s85, v74, 27
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s84, v74, 26
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s83, v74, 25
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s82, v74, 24
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s81, v74, 23
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s80, v74, 22
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s71, v74, 21
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s70, v74, 20
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s69, v74, 19
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s68, v74, 18
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s67, v74, 17
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s66, v74, 16
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s65, v74, 15
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s64, v74, 14
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s55, v74, 13
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s54, v74, 12
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s53, v74, 11
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s52, v74, 10
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s51, v74, 9
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s50, v74, 8
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s49, v74, 7
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s48, v74, 6
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s39, v74, 5
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s38, v74, 4
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s37, v74, 3
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s36, v74, 2
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s35, v74, 1
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s34, v74, 0
 ; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX11-FAKE16-NEXT:    s_clause 0x3 ; 16-byte Folded Reload
 ; GFX11-FAKE16-NEXT:    scratch_load_b32 v74, off, s32 offset:72
@@ -171067,72 +171067,70 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    ; implicit-def: $vgpr43 : SGPR spill to VGPR lane
 ; SI-NEXT:    s_waitcnt expcnt(3)
-; SI-NEXT:    v_writelane_b32 v40, s30, 0
+; SI-NEXT:    v_writelane_b32 v40, s34, 0
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_writelane_b32 v43, s29, 0
 ; SI-NEXT:    v_writelane_b32 v43, s28, 1
 ; SI-NEXT:    v_writelane_b32 v43, s27, 2
 ; SI-NEXT:    v_writelane_b32 v43, s26, 3
 ; SI-NEXT:    v_writelane_b32 v43, s25, 4
-; SI-NEXT:    v_writelane_b32 v40, s31, 1
 ; SI-NEXT:    v_writelane_b32 v43, s24, 5
-; SI-NEXT:    v_writelane_b32 v40, s34, 2
 ; SI-NEXT:    v_writelane_b32 v43, s23, 6
-; SI-NEXT:    v_writelane_b32 v40, s35, 3
+; SI-NEXT:    v_writelane_b32 v40, s35, 1
 ; SI-NEXT:    v_writelane_b32 v43, s22, 7
-; SI-NEXT:    v_writelane_b32 v40, s36, 4
+; SI-NEXT:    v_writelane_b32 v40, s36, 2
 ; SI-NEXT:    v_writelane_b32 v43, s21, 8
-; SI-NEXT:    v_writelane_b32 v40, s37, 5
+; SI-NEXT:    v_writelane_b32 v40, s37, 3
 ; SI-NEXT:    v_writelane_b32 v43, s20, 9
-; SI-NEXT:    v_writelane_b32 v40, s38, 6
+; SI-NEXT:    v_writelane_b32 v40, s38, 4
 ; SI-NEXT:    v_writelane_b32 v43, s19, 10
-; SI-NEXT:    v_writelane_b32 v40, s39, 7
+; SI-NEXT:    v_writelane_b32 v40, s39, 5
 ; SI-NEXT:    v_writelane_b32 v43, s18, 11
-; SI-NEXT:    v_writelane_b32 v40, s48, 8
+; SI-NEXT:    v_writelane_b32 v40, s48, 6
 ; SI-NEXT:    v_writelane_b32 v43, s17, 12
-; SI-NEXT:    v_writelane_b32 v40, s49, 9
+; SI-NEXT:    v_writelane_b32 v40, s49, 7
 ; SI-NEXT:    v_writelane_b32 v43, s16, 13
 ; SI-NEXT:    v_readfirstlane_b32 s4, v30
-; SI-NEXT:    v_writelane_b32 v40, s50, 10
+; SI-NEXT:    v_writelane_b32 v40, s50, 8
 ; SI-NEXT:    v_writelane_b32 v43, s4, 14
 ; SI-NEXT:    v_readfirstlane_b32 s4, v29
-; SI-NEXT:    v_writelane_b32 v40, s51, 11
+; SI-NEXT:    v_writelane_b32 v40, s51, 9
 ; SI-NEXT:    v_writelane_b32 v43, s4, 15
 ; SI-NEXT:    v_readfirstlane_b32 s4, v28
-; SI-NEXT:    v_writelane_b32 v40, s52, 12
+; SI-NEXT:    v_writelane_b32 v40, s52, 10
 ; SI-NEXT:    v_writelane_b32 v43, s4, 16
 ; SI-NEXT:    v_readfirstlane_b32 s4, v27
-; SI-NEXT:    v_writelane_b32 v40, s53, 13
+; SI-NEXT:    v_writelane_b32 v40, s53, 11
 ; SI-NEXT:    v_writelane_b32 v43, s4, 17
 ; SI-NEXT:    v_readfirstlane_b32 s4, v26
-; SI-NEXT:    v_writelane_b32 v40, s54, 14
+; SI-NEXT:    v_writelane_b32 v40, s54, 12
 ; SI-NEXT:    v_writelane_b32 v43, s4, 18
 ; SI-NEXT:    v_readfirstlane_b32 s4, v25
-; SI-NEXT:    v_writelane_b32 v40, s55, 15
+; SI-NEXT:    v_writelane_b32 v40, s55, 13
 ; SI-NEXT:    v_writelane_b32 v43, s4, 19
 ; SI-NEXT:    v_readfirstlane_b32 s4, v24
-; SI-NEXT:    v_writelane_b32 v40, s64, 16
+; SI-NEXT:    v_writelane_b32 v40, s64, 14
 ; SI-NEXT:    v_writelane_b32 v43, s4, 20
 ; SI-NEXT:    v_readfirstlane_b32 s4, v23
-; SI-NEXT:    v_writelane_b32 v40, s65, 17
+; SI-NEXT:    v_writelane_b32 v40, s65, 15
 ; SI-NEXT:    v_writelane_b32 v43, s4, 21
 ; SI-NEXT:    v_readfirstlane_b32 s4, v22
-; SI-NEXT:    v_writelane_b32 v40, s66, 18
+; SI-NEXT:    v_writelane_b32 v40, s66, 16
 ; SI-NEXT:    v_writelane_b32 v43, s4, 22
 ; SI-NEXT:    v_readfirstlane_b32 s4, v21
-; SI-NEXT:    v_writelane_b32 v40, s67, 19
+; SI-NEXT:    v_writelane_b32 v40, s67, 17
 ; SI-NEXT:    v_writelane_b32 v43, s4, 23
 ; SI-NEXT:    v_readfirstlane_b32 s4, v20
-; SI-NEXT:    v_writelane_b32 v40, s68, 20
+; SI-NEXT:    v_writelane_b32 v40, s68, 18
 ; SI-NEXT:    v_writelane_b32 v43, s4, 24
 ; SI-NEXT:    v_readfirstlane_b32 s4, v19
-; SI-NEXT:    v_writelane_b32 v40, s69, 21
+; SI-NEXT:    v_writelane_b32 v40, s69, 19
 ; SI-NEXT:    v_writelane_b32 v43, s4, 25
 ; SI-NEXT:    v_readfirstlane_b32 s4, v17
-; SI-NEXT:    v_writelane_b32 v40, s70, 22
+; SI-NEXT:    v_writelane_b32 v40, s70, 20
 ; SI-NEXT:    v_writelane_b32 v43, s4, 26
 ; SI-NEXT:    v_readfirstlane_b32 s4, v16
-; SI-NEXT:    v_writelane_b32 v40, s71, 23
+; SI-NEXT:    v_writelane_b32 v40, s71, 21
 ; SI-NEXT:    v_readfirstlane_b32 s71, v18
 ; SI-NEXT:    v_writelane_b32 v43, s4, 27
 ; SI-NEXT:    v_readfirstlane_b32 s4, v15
@@ -171200,21 +171198,23 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:196
 ; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:192
 ; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:188
-; SI-NEXT:    v_writelane_b32 v40, s80, 24
-; SI-NEXT:    v_writelane_b32 v40, s81, 25
-; SI-NEXT:    v_writelane_b32 v40, s82, 26
-; SI-NEXT:    v_writelane_b32 v40, s83, 27
-; SI-NEXT:    v_writelane_b32 v40, s84, 28
+; SI-NEXT:    v_writelane_b32 v40, s80, 22
+; SI-NEXT:    v_writelane_b32 v40, s81, 23
+; SI-NEXT:    v_writelane_b32 v40, s82, 24
+; SI-NEXT:    v_writelane_b32 v40, s83, 25
+; SI-NEXT:    v_writelane_b32 v40, s84, 26
 ; SI-NEXT:    v_writelane_b32 v43, s4, 38
-; SI-NEXT:    v_writelane_b32 v40, s85, 29
-; SI-NEXT:    v_writelane_b32 v40, s86, 30
-; SI-NEXT:    v_writelane_b32 v40, s87, 31
-; SI-NEXT:    v_writelane_b32 v40, s96, 32
-; SI-NEXT:    v_writelane_b32 v40, s97, 33
-; SI-NEXT:    v_writelane_b32 v40, s98, 34
+; SI-NEXT:    v_writelane_b32 v40, s85, 27
+; SI-NEXT:    v_writelane_b32 v40, s86, 28
+; SI-NEXT:    v_writelane_b32 v40, s87, 29
+; SI-NEXT:    v_writelane_b32 v40, s96, 30
+; SI-NEXT:    v_writelane_b32 v40, s97, 31
+; SI-NEXT:    v_writelane_b32 v40, s98, 32
 ; SI-NEXT:    v_readfirstlane_b32 s28, v3
+; SI-NEXT:    v_writelane_b32 v40, s99, 33
+; SI-NEXT:    v_writelane_b32 v40, s30, 34
+; SI-NEXT:    v_writelane_b32 v40, s31, 35
 ; SI-NEXT:    ; implicit-def: $vgpr42 : SGPR spill to VGPR lane
-; SI-NEXT:    v_writelane_b32 v40, s99, 35
 ; SI-NEXT:    s_waitcnt vmcnt(14)
 ; SI-NEXT:    v_readfirstlane_b32 s56, v19
 ; SI-NEXT:    v_readfirstlane_b32 s47, v20
@@ -172528,6 +172528,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; SI-NEXT:    s_lshl_b32 s16, s68, 16
 ; SI-NEXT:    s_or_b32 s4, s4, s47
 ; SI-NEXT:    s_or_b32 s5, s5, s16
+; SI-NEXT:    v_readlane_b32 s30, v40, 34
 ; SI-NEXT:    v_mov_b32_e32 v0, s17
 ; SI-NEXT:    v_mov_b32_e32 v1, s18
 ; SI-NEXT:    v_mov_b32_e32 v2, s19
@@ -172560,42 +172561,41 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v29, s7
 ; SI-NEXT:    v_mov_b32_e32 v30, s4
 ; SI-NEXT:    v_mov_b32_e32 v31, s5
-; SI-NEXT:    v_readlane_b32 s99, v40, 35
-; SI-NEXT:    v_readlane_b32 s98, v40, 34
-; SI-NEXT:    v_readlane_b32 s97, v40, 33
-; SI-NEXT:    v_readlane_b32 s96, v40, 32
-; SI-NEXT:    v_readlane_b32 s87, v40, 31
-; SI-NEXT:    v_readlane_b32 s86, v40, 30
-; SI-NEXT:    v_readlane_b32 s85, v40, 29
-; SI-NEXT:    v_readlane_b32 s84, v40, 28
-; SI-NEXT:    v_readlane_b32 s83, v40, 27
-; SI-NEXT:    v_readlane_b32 s82, v40, 26
-; SI-NEXT:    v_readlane_b32 s81, v40, 25
-; SI-NEXT:    v_readlane_b32 s80, v40, 24
-; SI-NEXT:    v_readlane_b32 s71, v40, 23
-; SI-NEXT:    v_readlane_b32 s70, v40, 22
-; SI-NEXT:    v_readlane_b32 s69, v40, 21
-; SI-NEXT:    v_readlane_b32 s68, v40, 20
-; SI-NEXT:    v_readlane_b32 s67, v40, 19
-; SI-NEXT:    v_readlane_b32 s66, v40, 18
-; SI-NEXT:    v_readlane_b32 s65, v40, 17
-; SI-NEXT:    v_readlane_b32 s64, v40, 16
-; SI-NEXT:    v_readlane_b32 s55, v40, 15
-; SI-NEXT:    v_readlane_b32 s54, v40, 14
-; SI-NEXT:    v_readlane_b32 s53, v40, 13
-; SI-NEXT:    v_readlane_b32 s52, v40, 12
-; SI-NEXT:    v_readlane_b32 s51, v40, 11
-; SI-NEXT:    v_readlane_b32 s50, v40, 10
-; SI-NEXT:    v_readlane_b32 s49, v40, 9
-; SI-NEXT:    v_readlane_b32 s48, v40, 8
-; SI-NEXT:    v_readlane_b32 s39, v40, 7
-; SI-NEXT:    v_readlane_b32 s38, v40, 6
-; SI-NEXT:    v_readlane_b32 s37, v40, 5
-; SI-NEXT:    v_readlane_b32 s36, v40, 4
-; SI-NEXT:    v_readlane_b32 s35, v40, 3
-; SI-NEXT:    v_readlane_b32 s34, v40, 2
-; SI-NEXT:    v_readlane_b32 s31, v40, 1
-; SI-NEXT:    v_readlane_b32 s30, v40, 0
+; SI-NEXT:    v_readlane_b32 s31, v40, 35
+; SI-NEXT:    v_readlane_b32 s99, v40, 33
+; SI-NEXT:    v_readlane_b32 s98, v40, 32
+; SI-NEXT:    v_readlane_b32 s97, v40, 31
+; SI-NEXT:    v_readlane_b32 s96, v40, 30
+; SI-NEXT:    v_readlane_b32 s87, v40, 29
+; SI-NEXT:    v_readlane_b32 s86, v40, 28
+; SI-NEXT:    v_readlane_b32 s85, v40, 27
+; SI-NEXT:    v_readlane_b32 s84, v40, 26
+; SI-NEXT:    v_readlane_b32 s83, v40, 25
+; SI-NEXT:    v_readlane_b32 s82, v40, 24
+; SI-NEXT:    v_readlane_b32 s81, v40, 23
+; SI-NEXT:    v_readlane_b32 s80, v40, 22
+; SI-NEXT:    v_readlane_b32 s71, v40, 21
+; SI-NEXT:    v_readlane_b32 s70, v40, 20
+; SI-NEXT:    v_readlane_b32 s69, v40, 19
+; SI-NEXT:    v_readlane_b32 s68, v40, 18
+; SI-NEXT:    v_readlane_b32 s67, v40, 17
+; SI-NEXT:    v_readlane_b32 s66, v40, 16
+; SI-NEXT:    v_readlane_b32 s65, v40, 15
+; SI-NEXT:    v_readlane_b32 s64, v40, 14
+; SI-NEXT:    v_readlane_b32 s55, v40, 13
+; SI-NEXT:    v_readlane_b32 s54, v40, 12
+; SI-NEXT:    v_readlane_b32 s53, v40, 11
+; SI-NEXT:    v_readlane_b32 s52, v40, 10
+; SI-NEXT:    v_readlane_b32 s51, v40, 9
+; SI-NEXT:    v_readlane_b32 s50, v40, 8
+; SI-NEXT:    v_readlane_b32 s49, v40, 7
+; SI-NEXT:    v_readlane_b32 s48, v40, 6
+; SI-NEXT:    v_readlane_b32 s39, v40, 5
+; SI-NEXT:    v_readlane_b32 s38, v40, 4
+; SI-NEXT:    v_readlane_b32 s37, v40, 3
+; SI-NEXT:    v_readlane_b32 s36, v40, 2
+; SI-NEXT:    v_readlane_b32 s35, v40, 1
+; SI-NEXT:    v_readlane_b32 s34, v40, 0
 ; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
@@ -180352,60 +180352,61 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(3)
-; SI-NEXT:    v_writelane_b32 v34, s30, 0
-; SI-NEXT:    v_writelane_b32 v34, s31, 1
-; SI-NEXT:    v_writelane_b32 v34, s34, 2
-; SI-NEXT:    v_writelane_b32 v34, s35, 3
-; SI-NEXT:    v_writelane_b32 v34, s36, 4
-; SI-NEXT:    v_writelane_b32 v34, s37, 5
-; SI-NEXT:    v_writelane_b32 v34, s38, 6
-; SI-NEXT:    v_writelane_b32 v34, s39, 7
-; SI-NEXT:    v_writelane_b32 v34, s48, 8
-; SI-NEXT:    v_writelane_b32 v34, s49, 9
-; SI-NEXT:    v_writelane_b32 v34, s50, 10
-; SI-NEXT:    v_writelane_b32 v34, s51, 11
-; SI-NEXT:    v_writelane_b32 v34, s52, 12
-; SI-NEXT:    v_writelane_b32 v34, s53, 13
-; SI-NEXT:    v_writelane_b32 v34, s54, 14
-; SI-NEXT:    v_writelane_b32 v34, s55, 15
-; SI-NEXT:    v_writelane_b32 v34, s64, 16
-; SI-NEXT:    v_writelane_b32 v34, s65, 17
-; SI-NEXT:    v_writelane_b32 v34, s66, 18
-; SI-NEXT:    v_writelane_b32 v34, s67, 19
-; SI-NEXT:    v_writelane_b32 v34, s68, 20
-; SI-NEXT:    v_writelane_b32 v34, s69, 21
-; SI-NEXT:    v_writelane_b32 v34, s70, 22
-; SI-NEXT:    v_writelane_b32 v34, s71, 23
-; SI-NEXT:    v_writelane_b32 v34, s80, 24
-; SI-NEXT:    v_writelane_b32 v34, s81, 25
-; SI-NEXT:    v_writelane_b32 v34, s82, 26
+; SI-NEXT:    v_writelane_b32 v34, s34, 0
+; SI-NEXT:    v_writelane_b32 v34, s35, 1
+; SI-NEXT:    v_writelane_b32 v34, s36, 2
+; SI-NEXT:    v_writelane_b32 v34, s37, 3
+; SI-NEXT:    v_writelane_b32 v34, s38, 4
+; SI-NEXT:    v_writelane_b32 v34, s39, 5
+; SI-NEXT:    v_writelane_b32 v34, s48, 6
+; SI-NEXT:    v_writelane_b32 v34, s49, 7
+; SI-NEXT:    v_writelane_b32 v34, s50, 8
+; SI-NEXT:    v_writelane_b32 v34, s51, 9
+; SI-NEXT:    v_writelane_b32 v34, s52, 10
+; SI-NEXT:    v_writelane_b32 v34, s53, 11
+; SI-NEXT:    v_writelane_b32 v34, s54, 12
+; SI-NEXT:    v_writelane_b32 v34, s55, 13
+; SI-NEXT:    v_writelane_b32 v34, s64, 14
+; SI-NEXT:    v_writelane_b32 v34, s65, 15
+; SI-NEXT:    v_writelane_b32 v34, s66, 16
+; SI-NEXT:    v_writelane_b32 v34, s67, 17
+; SI-NEXT:    v_writelane_b32 v34, s68, 18
+; SI-NEXT:    v_writelane_b32 v34, s69, 19
+; SI-NEXT:    v_writelane_b32 v34, s70, 20
+; SI-NEXT:    v_writelane_b32 v34, s71, 21
+; SI-NEXT:    v_writelane_b32 v34, s80, 22
+; SI-NEXT:    v_writelane_b32 v34, s81, 23
+; SI-NEXT:    v_writelane_b32 v34, s82, 24
+; SI-NEXT:    v_writelane_b32 v34, s83, 25
+; SI-NEXT:    v_writelane_b32 v34, s84, 26
+; SI-NEXT:    v_writelane_b32 v34, s85, 27
+; SI-NEXT:    v_writelane_b32 v34, s86, 28
+; SI-NEXT:    v_writelane_b32 v34, s87, 29
 ; SI-NEXT:    s_lshr_b32 s5, s16, 16
 ; SI-NEXT:    ; implicit-def: $vgpr37 : SGPR spill to VGPR lane
-; SI-NEXT:    v_writelane_b32 v34, s83, 27
+; SI-NEXT:    v_writelane_b32 v34, s96, 30
 ; SI-NEXT:    s_lshr_b32 s6, s18, 16
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_writelane_b32 v37, s5, 0
-; SI-NEXT:    v_writelane_b32 v34, s84, 28
+; SI-NEXT:    v_writelane_b32 v34, s97, 31
 ; SI-NEXT:    s_lshr_b32 s7, s20, 16
 ; SI-NEXT:    v_writelane_b32 v37, s6, 2
-; SI-NEXT:    v_writelane_b32 v34, s85, 29
+; SI-NEXT:    v_writelane_b32 v34, s98, 32
 ; SI-NEXT:    s_lshr_b32 s8, s22, 16
 ; SI-NEXT:    v_writelane_b32 v37, s7, 4
-; SI-NEXT:    v_writelane_b32 v34, s86, 30
+; SI-NEXT:    v_writelane_b32 v34, s99, 33
 ; SI-NEXT:    s_lshr_b32 s9, s24, 16
 ; SI-NEXT:    v_writelane_b32 v37, s8, 5
-; SI-NEXT:    v_writelane_b32 v34, s87, 31
+; SI-NEXT:    v_writelane_b32 v34, s30, 34
 ; SI-NEXT:    s_lshr_b32 s10, s26, 16
 ; SI-NEXT:    v_writelane_b32 v37, s9, 6
-; SI-NEXT:    v_writelane_b32 v34, s96, 32
+; SI-NEXT:    v_writelane_b32 v34, s31, 35
 ; SI-NEXT:    s_lshr_b32 s11, s28, 16
 ; SI-NEXT:    v_readfirstlane_b32 s31, v3
 ; SI-NEXT:    v_writelane_b32 v37, s10, 7
-; SI-NEXT:    v_writelane_b32 v34, s97, 33
 ; SI-NEXT:    v_readfirstlane_b32 s37, v5
 ; SI-NEXT:    s_lshr_b32 s12, s31, 16
 ; SI-NEXT:    v_writelane_b32 v37, s11, 8
-; SI-NEXT:    v_writelane_b32 v34, s98, 34
 ; SI-NEXT:    v_readfirstlane_b32 s81, v18
 ; SI-NEXT:    v_readfirstlane_b32 s83, v17
 ; SI-NEXT:    v_readfirstlane_b32 s70, v16
@@ -180424,7 +180425,6 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
 ; SI-NEXT:    v_readfirstlane_b32 s91, v2
 ; SI-NEXT:    v_readfirstlane_b32 s93, v1
 ; SI-NEXT:    v_writelane_b32 v37, s12, 9
-; SI-NEXT:    v_writelane_b32 v34, s99, 35
 ; SI-NEXT:    s_mov_b32 s53, s16
 ; SI-NEXT:    s_lshr_b32 s90, s29, 16
 ; SI-NEXT:    s_lshr_b32 s89, s27, 16
@@ -181967,6 +181967,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    v_or_b32_e32 v1, s4, v1
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x7c, v0
+; SI-NEXT:    v_readlane_b32 s30, v34, 34
 ; SI-NEXT:    v_readlane_b32 s23, v37, 54
 ; SI-NEXT:    v_readlane_b32 s21, v36, 14
 ; SI-NEXT:    v_readlane_b32 s17, v36, 20
@@ -181974,42 +181975,41 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
 ; SI-NEXT:    v_readlane_b32 s11, v36, 38
 ; SI-NEXT:    v_readlane_b32 s9, v37, 1
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT:    v_readlane_b32 s99, v34, 35
-; SI-NEXT:    v_readlane_b32 s98, v34, 34
-; SI-NEXT:    v_readlane_b32 s97, v34, 33
-; SI-NEXT:    v_readlane_b32 s96, v34, 32
-; SI-NEXT:    v_readlane_b32 s87, v34, 31
-; SI-NEXT:    v_readlane_b32 s86, v34, 30
-; SI-NEXT:    v_readlane_b32 s85, v34, 29
-; SI-NEXT:    v_readlane_b32 s84, v34, 28
-; SI-NEXT:    v_readlane_b32 s83, v34, 27
-; SI-NEXT:    v_readlane_b32 s82, v34, 26
-; SI-NEXT:    v_readlane_b32 s81, v34, 25
-; SI-NEXT:    v_readlane_b32 s80, v34, 24
-; SI-NEXT:    v_readlane_b32 s71, v34, 23
-; SI-NEXT:    v_readlane_b32 s70, v34, 22
-; SI-NEXT:    v_readlane_b32 s69, v34, 21
-; SI-NEXT:    v_readlane_b32 s68, v34, 20
-; SI-NEXT:    v_readlane_b32 s67, v34, 19
-; SI-NEXT:    v_readlane_b32 s66, v34, 18
-; SI-NEXT:    v_readlane_b32 s65, v34, 17
-; SI-NEXT:    v_readlane_b32 s64, v34, 16
-; SI-NEXT:    v_readlane_b32 s55, v34, 15
-; SI-NEXT:    v_readlane_b32 s54, v34, 14
-; SI-NEXT:    v_readlane_b32 s53, v34, 13
-; SI-NEXT:    v_readlane_b32 s52, v34, 12
-; SI-NEXT:    v_readlane_b32 s51, v34, 11
-; SI-NEXT:    v_readlane_b32 s50, v34, 10
-; SI-NEXT:    v_readlane_b32 s49, v34, 9
-; SI-NEXT:    v_readlane_b32 s48, v34, 8
-; SI-NEXT:    v_readlane_b32 s39, v34, 7
-; SI-NEXT:    v_readlane_b32 s38, v34, 6
-; SI-NEXT:    v_readlane_b32 s37, v34, 5
-; SI-NEXT:    v_readlane_b32 s36, v34, 4
-; SI-NEXT:    v_readlane_b32 s35, v34, 3
-; SI-NEXT:    v_readlane_b32 s34, v34, 2
-; SI-NEXT:    v_readlane_b32 s31, v34, 1
-; SI-NEXT:    v_readlane_b32 s30, v34, 0
+; SI-NEXT:    v_readlane_b32 s31, v34, 35
+; SI-NEXT:    v_readlane_b32 s99, v34, 33
+; SI-NEXT:    v_readlane_b32 s98, v34, 32
+; SI-NEXT:    v_readlane_b32 s97, v34, 31
+; SI-NEXT:    v_readlane_b32 s96, v34, 30
+; SI-NEXT:    v_readlane_b32 s87, v34, 29
+; SI-NEXT:    v_readlane_b32 s86, v34, 28
+; SI-NEXT:    v_readlane_b32 s85, v34, 27
+; SI-NEXT:    v_readlane_b32 s84, v34, 26
+; SI-NEXT:    v_readlane_b32 s83, v34, 25
+; SI-NEXT:    v_readlane_b32 s82, v34, 24
+; SI-NEXT:    v_readlane_b32 s81, v34, 23
+; SI-NEXT:    v_readlane_b32 s80, v34, 22
+; SI-NEXT:    v_readlane_b32 s71, v34, 21
+; SI-NEXT:    v_readlane_b32 s70, v34, 20
+; SI-NEXT:    v_readlane_b32 s69, v34, 19
+; SI-NEXT:    v_readlane_b32 s68, v34, 18
+; SI-NEXT:    v_readlane_b32 s67, v34, 17
+; SI-NEXT:    v_readlane_b32 s66, v34, 16
+; SI-NEXT:    v_readlane_b32 s65, v34, 15
+; SI-NEXT:    v_readlane_b32 s64, v34, 14
+; SI-NEXT:    v_readlane_b32 s55, v34, 13
+; SI-NEXT:    v_readlane_b32 s54, v34, 12
+; SI-NEXT:    v_readlane_b32 s53, v34, 11
+; SI-NEXT:    v_readlane_b32 s52, v34, 10
+; SI-NEXT:    v_readlane_b32 s51, v34, 9
+; SI-NEXT:    v_readlane_b32 s50, v34, 8
+; SI-NEXT:    v_readlane_b32 s49, v34, 7
+; SI-NEXT:    v_readlane_b32 s48, v34, 6
+; SI-NEXT:    v_readlane_b32 s39, v34, 5
+; SI-NEXT:    v_readlane_b32 s38, v34, 4
+; SI-NEXT:    v_readlane_b32 s37, v34, 3
+; SI-NEXT:    v_readlane_b32 s36, v34, 2
+; SI-NEXT:    v_readlane_b32 s35, v34, 1
+; SI-NEXT:    v_readlane_b32 s34, v34, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -182026,39 +182026,39 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
 ; VI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v63, s30, 0
-; VI-NEXT:    v_writelane_b32 v63, s31, 1
-; VI-NEXT:    v_writelane_b32 v63, s34, 2
-; VI-NEXT:    v_writelane_b32 v63, s35, 3
-; VI-NEXT:    v_writelane_b32 v63, s36, 4
-; VI-NEXT:    v_writelane_b32 v63, s37, 5
-; VI-NEXT:    v_writelane_b32 v63, s38, 6
-; VI-NEXT:    v_writelane_b32 v63, s39, 7
-; VI-NEXT:    v_writelane_b32 v63, s48, 8
-; VI-NEXT:    v_writelane_b32 v63, s49, 9
-; VI-NEXT:    v_writelane_b32 v63, s50, 10
-; VI-NEXT:    v_writelane_b32 v63, s51, 11
-; VI-NEXT:    v_writelane_b32 v63, s52, 12
-; VI-NEXT:    v_writelane_b32 v63, s53, 13
-; VI-NEXT:    v_writelane_b32 v63, s54, 14
-; VI-NEXT:    v_writelane_b32 v63, s55, 15
-; VI-NEXT:    v_writelane_b32 v63, s64, 16
-; VI-NEXT:    v_writelane_b32 v63, s65, 17
-; VI-NEXT:    v_writelane_b32 v63, s66, 18
-; VI-NEXT:    v_writelane_b32 v63, s67, 19
-; VI-NEXT:    v_writelane_b32 v63, s68, 20
-; VI-NEXT:    v_writelane_b32 v63, s69, 21
-; VI-NEXT:    v_writelane_b32 v63, s70, 22
-; VI-NEXT:    v_writelane_b32 v63, s71, 23
-; VI-NEXT:    v_writelane_b32 v63, s80, 24
-; VI-NEXT:    v_writelane_b32 v63, s81, 25
-; VI-NEXT:    v_writelane_b32 v63, s82, 26
-; VI-NEXT:    v_writelane_b32 v63, s83, 27
-; VI-NEXT:    v_writelane_b32 v63, s84, 28
-; VI-NEXT:    v_writelane_b32 v63, s85, 29
-; VI-NEXT:    v_writelane_b32 v63, s86, 30
+; VI-NEXT:    v_writelane_b32 v63, s34, 0
+; VI-NEXT:    v_writelane_b32 v63, s35, 1
+; VI-NEXT:    v_writelane_b32 v63, s36, 2
+; VI-NEXT:    v_writelane_b32 v63, s37, 3
+; VI-NEXT:    v_writelane_b32 v63, s38, 4
+; VI-NEXT:    v_writelane_b32 v63, s39, 5
+; VI-NEXT:    v_writelane_b32 v63, s48, 6
+; VI-NEXT:    v_writelane_b32 v63, s49, 7
+; VI-NEXT:    v_writelane_b32 v63, s50, 8
+; VI-NEXT:    v_writelane_b32 v63, s51, 9
+; VI-NEXT:    v_writelane_b32 v63, s52, 10
+; VI-NEXT:    v_writelane_b32 v63, s53, 11
+; VI-NEXT:    v_writelane_b32 v63, s54, 12
+; VI-NEXT:    v_writelane_b32 v63, s55, 13
+; VI-NEXT:    v_writelane_b32 v63, s64, 14
+; VI-NEXT:    v_writelane_b32 v63, s65, 15
+; VI-NEXT:    v_writelane_b32 v63, s66, 16
+; VI-NEXT:    v_writelane_b32 v63, s67, 17
+; VI-NEXT:    v_writelane_b32 v63, s68, 18
+; VI-NEXT:    v_writelane_b32 v63, s69, 19
+; VI-NEXT:    v_writelane_b32 v63, s70, 20
+; VI-NEXT:    v_writelane_b32 v63, s71, 21
+; VI-NEXT:    v_writelane_b32 v63, s80, 22
+; VI-NEXT:    v_writelane_b32 v63, s81, 23
+; VI-NEXT:    v_writelane_b32 v63, s82, 24
+; VI-NEXT:    v_writelane_b32 v63, s83, 25
+; VI-NEXT:    v_writelane_b32 v63, s84, 26
+; VI-NEXT:    v_writelane_b32 v63, s85, 27
+; VI-NEXT:    v_writelane_b32 v63, s86, 28
+; VI-NEXT:    v_writelane_b32 v63, s87, 29
+; VI-NEXT:    v_writelane_b32 v63, s30, 30
 ; VI-NEXT:    v_readfirstlane_b32 s4, v19
-; VI-NEXT:    v_writelane_b32 v63, s87, 31
+; VI-NEXT:    v_writelane_b32 v63, s31, 31
 ; VI-NEXT:    v_readfirstlane_b32 s7, v18
 ; VI-NEXT:    v_readfirstlane_b32 s6, v17
 ; VI-NEXT:    v_readfirstlane_b32 s9, v16
@@ -182979,38 +182979,38 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
 ; VI-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
 ; VI-NEXT:    v_perm_b32 v9, v47, v9, s4
 ; VI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; VI-NEXT:    v_readlane_b32 s87, v63, 31
-; VI-NEXT:    v_readlane_b32 s86, v63, 30
-; VI-NEXT:    v_readlane_b32 s85, v63, 29
-; VI-NEXT:    v_readlane_b32 s84, v63, 28
-; VI-NEXT:    v_readlane_b32 s83, v63, 27
-; VI-NEXT:    v_readlane_b32 s82, v63, 26
-; VI-NEXT:    v_readlane_b32 s81, v63, 25
-; VI-NEXT:    v_readlane_b32 s80, v63, 24
-; VI-NEXT:    v_readlane_b32 s71, v63, 23
-; VI-NEXT:    v_readlane_b32 s70, v63, 22
-; VI-NEXT:    v_readlane_b32 s69, v63, 21
-; VI-NEXT:    v_readlane_b32 s68, v63, 20
-; VI-NEXT:    v_readlane_b32 s67, v63, 19
-; VI-NEXT:    v_readlane_b32 s66, v63, 18
-; VI-NEXT:    v_readlane_b32 s65, v63, 17
-; VI-NEXT:    v_readlane_b32 s64, v63, 16
-; VI-NEXT:    v_readlane_b32 s55, v63, 15
-; VI-NEXT:    v_readlane_b32 s54, v63, 14
-; VI-NEXT:    v_readlane_b32 s53, v63, 13
-; VI-NEXT:    v_readlane_b32 s52, v63, 12
-; VI-NEXT:    v_readlane_b32 s51, v63, 11
-; VI-NEXT:    v_readlane_b32 s50, v63, 10
-; VI-NEXT:    v_readlane_b32 s49, v63, 9
-; VI-NEXT:    v_readlane_b32 s48, v63, 8
-; VI-NEXT:    v_readlane_b32 s39, v63, 7
-; VI-NEXT:    v_readlane_b32 s38, v63, 6
-; VI-NEXT:    v_readlane_b32 s37, v63, 5
-; VI-NEXT:    v_readlane_b32 s36, v63, 4
-; VI-NEXT:    v_readlane_b32 s35, v63, 3
-; VI-NEXT:    v_readlane_b32 s34, v63, 2
-; VI-NEXT:    v_readlane_b32 s31, v63, 1
-; VI-NEXT:    v_readlane_b32 s30, v63, 0
+; VI-NEXT:    v_readlane_b32 s30, v63, 30
+; VI-NEXT:    v_readlane_b32 s31, v63, 31
+; VI-NEXT:    v_readlane_b32 s87, v63, 29
+; VI-NEXT:    v_readlane_b32 s86, v63, 28
+; VI-NEXT:    v_readlane_b32 s85, v63, 27
+; VI-NEXT:    v_readlane_b32 s84, v63, 26
+; VI-NEXT:    v_readlane_b32 s83, v63, 25
+; VI-NEXT:    v_readlane_b32 s82, v63, 24
+; VI-NEXT:    v_readlane_b32 s81, v63, 23
+; VI-NEXT:    v_readlane_b32 s80, v63, 22
+; VI-NEXT:    v_readlane_b32 s71, v63, 21
+; VI-NEXT:    v_readlane_b32 s70, v63, 20
+; VI-NEXT:    v_readlane_b32 s69, v63, 19
+; VI-NEXT:    v_readlane_b32 s68, v63, 18
+; VI-NEXT:    v_readlane_b32 s67, v63, 17
+; VI-NEXT:    v_readlane_b32 s66, v63, 16
+; VI-NEXT:    v_readlane_b32 s65, v63, 15
+; VI-NEXT:    v_readlane_b32 s64, v63, 14
+; VI-NEXT:    v_readlane_b32 s55, v63, 13
+; VI-NEXT:    v_readlane_b32 s54, v63, 12
+; VI-NEXT:    v_readlane_b32 s53, v63, 11
+; VI-NEXT:    v_readlane_b32 s52, v63, 10
+; VI-NEXT:    v_readlane_b32 s51, v63, 9
+; VI-NEXT:    v_readlane_b32 s50, v63, 8
+; VI-NEXT:    v_readlane_b32 s49, v63, 7
+; VI-NEXT:    v_readlane_b32 s48, v63, 6
+; VI-NEXT:    v_readlane_b32 s39, v63, 5
+; VI-NEXT:    v_readlane_b32 s38, v63, 4
+; VI-NEXT:    v_readlane_b32 s37, v63, 3
+; VI-NEXT:    v_readlane_b32 s36, v63, 2
+; VI-NEXT:    v_readlane_b32 s35, v63, 1
+; VI-NEXT:    v_readlane_b32 s34, v63, 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_perm_b32 v56, v23, v56, s4
 ; VI-NEXT:    v_or_b32_e32 v20, v56, v20
@@ -183335,43 +183335,43 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
 ; GFX9-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v63, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v63, s31, 1
-; GFX9-NEXT:    v_writelane_b32 v63, s34, 2
-; GFX9-NEXT:    v_writelane_b32 v63, s35, 3
-; GFX9-NEXT:    v_writelane_b32 v63, s36, 4
-; GFX9-NEXT:    v_writelane_b32 v63, s37, 5
-; GFX9-NEXT:    v_writelane_b32 v63, s38, 6
-; GFX9-NEXT:    v_writelane_b32 v63, s39, 7
-; GFX9-NEXT:    v_writelane_b32 v63, s48, 8
-; GFX9-NEXT:    v_writelane_b32 v63, s49, 9
-; GFX9-NEXT:    v_writelane_b32 v63, s50, 10
-; GFX9-NEXT:    v_writelane_b32 v63, s51, 11
-; GFX9-NEXT:    v_writelane_b32 v63, s52, 12
-; GFX9-NEXT:    v_writelane_b32 v63, s53, 13
-; GFX9-NEXT:    v_writelane_b32 v63, s54, 14
-; GFX9-NEXT:    v_writelane_b32 v63, s55, 15
-; GFX9-NEXT:    v_writelane_b32 v63, s64, 16
-; GFX9-NEXT:    v_writelane_b32 v63, s65, 17
-; GFX9-NEXT:    v_writelane_b32 v63, s66, 18
-; GFX9-NEXT:    v_writelane_b32 v63, s67, 19
-; GFX9-NEXT:    v_writelane_b32 v63, s68, 20
-; GFX9-NEXT:    v_writelane_b32 v63, s69, 21
-; GFX9-NEXT:    v_writelane_b32 v63, s70, 22
-; GFX9-NEXT:    v_writelane_b32 v63, s71, 23
-; GFX9-NEXT:    v_writelane_b32 v63, s80, 24
-; GFX9-NEXT:    v_writelane_b32 v63, s81, 25
-; GFX9-NEXT:    v_writelane_b32 v63, s82, 26
-; GFX9-NEXT:    v_writelane_b32 v63, s83, 27
-; GFX9-NEXT:    v_writelane_b32 v63, s84, 28
-; GFX9-NEXT:    v_writelane_b32 v63, s85, 29
-; GFX9-NEXT:    v_writelane_b32 v63, s86, 30
-; GFX9-NEXT:    v_writelane_b32 v63, s87, 31
-; GFX9-NEXT:    v_writelane_b32 v63, s96, 32
-; GFX9-NEXT:    v_writelane_b32 v63, s97, 33
-; GFX9-NEXT:    v_writelane_b32 v63, s98, 34
+; GFX9-NEXT:    v_writelane_b32 v63, s34, 0
+; GFX9-NEXT:    v_writelane_b32 v63, s35, 1
+; GFX9-NEXT:    v_writelane_b32 v63, s36, 2
+; GFX9-NEXT:    v_writelane_b32 v63, s37, 3
+; GFX9-NEXT:    v_writelane_b32 v63, s38, 4
+; GFX9-NEXT:    v_writelane_b32 v63, s39, 5
+; GFX9-NEXT:    v_writelane_b32 v63, s48, 6
+; GFX9-NEXT:    v_writelane_b32 v63, s49, 7
+; GFX9-NEXT:    v_writelane_b32 v63, s50, 8
+; GFX9-NEXT:    v_writelane_b32 v63, s51, 9
+; GFX9-NEXT:    v_writelane_b32 v63, s52, 10
+; GFX9-NEXT:    v_writelane_b32 v63, s53, 11
+; GFX9-NEXT:    v_writelane_b32 v63, s54, 12
+; GFX9-NEXT:    v_writelane_b32 v63, s55, 13
+; GFX9-NEXT:    v_writelane_b32 v63, s64, 14
+; GFX9-NEXT:    v_writelane_b32 v63, s65, 15
+; GFX9-NEXT:    v_writelane_b32 v63, s66, 16
+; GFX9-NEXT:    v_writelane_b32 v63, s67, 17
+; GFX9-NEXT:    v_writelane_b32 v63, s68, 18
+; GFX9-NEXT:    v_writelane_b32 v63, s69, 19
+; GFX9-NEXT:    v_writelane_b32 v63, s70, 20
+; GFX9-NEXT:    v_writelane_b32 v63, s71, 21
+; GFX9-NEXT:    v_writelane_b32 v63, s80, 22
+; GFX9-NEXT:    v_writelane_b32 v63, s81, 23
+; GFX9-NEXT:    v_writelane_b32 v63, s82, 24
+; GFX9-NEXT:    v_writelane_b32 v63, s83, 25
+; GFX9-NEXT:    v_writelane_b32 v63, s84, 26
+; GFX9-NEXT:    v_writelane_b32 v63, s85, 27
+; GFX9-NEXT:    v_writelane_b32 v63, s86, 28
+; GFX9-NEXT:    v_writelane_b32 v63, s87, 29
+; GFX9-NEXT:    v_writelane_b32 v63, s96, 30
+; GFX9-NEXT:    v_writelane_b32 v63, s97, 31
+; GFX9-NEXT:    v_writelane_b32 v63, s98, 32
+; GFX9-NEXT:    v_writelane_b32 v63, s99, 33
+; GFX9-NEXT:    v_writelane_b32 v63, s30, 34
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v19
-; GFX9-NEXT:    v_writelane_b32 v63, s99, 35
+; GFX9-NEXT:    v_writelane_b32 v63, s31, 35
 ; GFX9-NEXT:    v_readfirstlane_b32 s7, v18
 ; GFX9-NEXT:    v_readfirstlane_b32 s6, v17
 ; GFX9-NEXT:    v_readfirstlane_b32 s9, v16
@@ -184236,42 +184236,42 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
 ; GFX9-NEXT:    v_perm_b32 v30, v30, v56, s4
 ; GFX9-NEXT:    v_perm_b32 v27, v27, v46, s4
 ; GFX9-NEXT:    v_perm_b32 v1, v1, v45, s4
-; GFX9-NEXT:    v_readlane_b32 s99, v63, 35
-; GFX9-NEXT:    v_readlane_b32 s98, v63, 34
-; GFX9-NEXT:    v_readlane_b32 s97, v63, 33
-; GFX9-NEXT:    v_readlane_b32 s96, v63, 32
-; GFX9-NEXT:    v_readlane_b32 s87, v63, 31
-; GFX9-NEXT:    v_readlane_b32 s86, v63, 30
-; GFX9-NEXT:    v_readlane_b32 s85, v63, 29
-; GFX9-NEXT:    v_readlane_b32 s84, v63, 28
-; GFX9-NEXT:    v_readlane_b32 s83, v63, 27
-; GFX9-NEXT:    v_readlane_b32 s82, v63, 26
-; GFX9-NEXT:    v_readlane_b32 s81, v63, 25
-; GFX9-NEXT:    v_readlane_b32 s80, v63, 24
-; GFX9-NEXT:    v_readlane_b32 s71, v63, 23
-; GFX9-NEXT:    v_readlane_b32 s70, v63, 22
-; GFX9-NEXT:    v_readlane_b32 s69, v63, 21
-; GFX9-NEXT:    v_readlane_b32 s68, v63, 20
-; GFX9-NEXT:    v_readlane_b32 s67, v63, 19
-; GFX9-NEXT:    v_readlane_b32 s66, v63, 18
-; GFX9-NEXT:    v_readlane_b32 s65, v63, 17
-; GFX9-NEXT:    v_readlane_b32 s64, v63, 16
-; GFX9-NEXT:    v_readlane_b32 s55, v63, 15
-; GFX9-NEXT:    v_readlane_b32 s54, v63, 14
-; GFX9-NEXT:    v_readlane_b32 s53, v63, 13
-; GFX9-NEXT:    v_readlane_b32 s52, v63, 12
-; GFX9-NEXT:    v_readlane_b32 s51, v63, 11
-; GFX9-NEXT:    v_readlane_b32 s50, v63, 10
-; GFX9-NEXT:    v_readlane_b32 s49, v63, 9
-; GFX9-NEXT:    v_readlane_b32 s48, v63, 8
-; GFX9-NEXT:    v_readlane_b32 s39, v63, 7
-; GFX9-NEXT:    v_readlane_b32 s38, v63, 6
-; GFX9-NEXT:    v_readlane_b32 s37, v63, 5
-; GFX9-NEXT:    v_readlane_b32 s36, v63, 4
-; GFX9-NEXT:    v_readlane_b32 s35, v63, 3
-; GFX9-NEXT:    v_readlane_b32 s34, v63, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v63, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v63, 0
+; GFX9-NEXT:    v_readlane_b32 s30, v63, 34
+; GFX9-NEXT:    v_readlane_b32 s31, v63, 35
+; GFX9-NEXT:    v_readlane_b32 s99, v63, 33
+; GFX9-NEXT:    v_readlane_b32 s98, v63, 32
+; GFX9-NEXT:    v_readlane_b32 s97, v63, 31
+; GFX9-NEXT:    v_readlane_b32 s96, v63, 30
+; GFX9-NEXT:    v_readlane_b32 s87, v63, 29
+; GFX9-NEXT:    v_readlane_b32 s86, v63, 28
+; GFX9-NEXT:    v_readlane_b32 s85, v63, 27
+; GFX9-NEXT:    v_readlane_b32 s84, v63, 26
+; GFX9-NEXT:    v_readlane_b32 s83, v63, 25
+; GFX9-NEXT:    v_readlane_b32 s82, v63, 24
+; GFX9-NEXT:    v_readlane_b32 s81, v63, 23
+; GFX9-NEXT:    v_readlane_b32 s80, v63, 22
+; GFX9-NEXT:    v_readlane_b32 s71, v63, 21
+; GFX9-NEXT:    v_readlane_b32 s70, v63, 20
+; GFX9-NEXT:    v_readlane_b32 s69, v63, 19
+; GFX9-NEXT:    v_readlane_b32 s68, v63, 18
+; GFX9-NEXT:    v_readlane_b32 s67, v63, 17
+; GFX9-NEXT:    v_readlane_b32 s66, v63, 16
+; GFX9-NEXT:    v_readlane_b32 s65, v63, 15
+; GFX9-NEXT:    v_readlane_b32 s64, v63, 14
+; GFX9-NEXT:    v_readlane_b32 s55, v63, 13
+; GFX9-NEXT:    v_readlane_b32 s54, v63, 12
+; GFX9-NEXT:    v_readlane_b32 s53, v63, 11
+; GFX9-NEXT:    v_readlane_b32 s52, v63, 10
+; GFX9-NEXT:    v_readlane_b32 s51, v63, 9
+; GFX9-NEXT:    v_readlane_b32 s50, v63, 8
+; GFX9-NEXT:    v_readlane_b32 s49, v63, 7
+; GFX9-NEXT:    v_readlane_b32 s48, v63, 6
+; GFX9-NEXT:    v_readlane_b32 s39, v63, 5
+; GFX9-NEXT:    v_readlane_b32 s38, v63, 4
+; GFX9-NEXT:    v_readlane_b32 s37, v63, 3
+; GFX9-NEXT:    v_readlane_b32 s36, v63, 2
+; GFX9-NEXT:    v_readlane_b32 s35, v63, 1
+; GFX9-NEXT:    v_readlane_b32 s34, v63, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NEXT:    v_perm_b32 v36, v58, v52, s4
 ; GFX9-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
@@ -184572,33 +184572,33 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
 ; GFX11-NEXT:    scratch_store_b32 off, v76, s32 offset:80
 ; GFX11-NEXT:    scratch_store_b32 off, v77, s32 offset:84
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s4
-; GFX11-NEXT:    v_writelane_b32 v74, s30, 0
-; GFX11-NEXT:    v_writelane_b32 v75, s96, 0
+; GFX11-NEXT:    v_writelane_b32 v74, s34, 0
+; GFX11-NEXT:    v_writelane_b32 v75, s98, 0
 ; GFX11-NEXT:    v_readfirstlane_b32 s42, v15
 ; GFX11-NEXT:    v_readfirstlane_b32 s5, v14
 ; GFX11-NEXT:    v_readfirstlane_b32 s4, v13
-; GFX11-NEXT:    v_writelane_b32 v74, s31, 1
-; GFX11-NEXT:    v_writelane_b32 v75, s97, 1
+; GFX11-NEXT:    v_writelane_b32 v74, s35, 1
+; GFX11-NEXT:    v_writelane_b32 v75, s99, 1
 ; GFX11-NEXT:    v_readfirstlane_b32 s7, v12
 ; GFX11-NEXT:    v_readfirstlane_b32 s6, v11
 ; GFX11-NEXT:    v_readfirstlane_b32 s9, v10
-; GFX11-NEXT:    v_writelane_b32 v74, s34, 2
-; GFX11-NEXT:    v_writelane_b32 v75, s98, 2
+; GFX11-NEXT:    v_writelane_b32 v74, s36, 2
+; GFX11-NEXT:    v_writelane_b32 v75, s100, 2
 ; GFX11-NEXT:    v_readfirstlane_b32 s8, v9
 ; GFX11-NEXT:    v_readfirstlane_b32 s11, v8
 ; GFX11-NEXT:    v_readfirstlane_b32 s10, v7
-; GFX11-NEXT:    v_writelane_b32 v74, s35, 3
-; GFX11-NEXT:    v_writelane_b32 v75, s99, 3
+; GFX11-NEXT:    v_writelane_b32 v74, s37, 3
+; GFX11-NEXT:    v_writelane_b32 v75, s101, 3
 ; GFX11-NEXT:    v_readfirstlane_b32 s13, v6
 ; GFX11-NEXT:    v_readfirstlane_b32 s12, v5
 ; GFX11-NEXT:    v_readfirstlane_b32 s15, v4
-; GFX11-NEXT:    v_writelane_b32 v74, s36, 4
-; GFX11-NEXT:    v_writelane_b32 v75, s100, 4
+; GFX11-NEXT:    v_writelane_b32 v74, s38, 4
+; GFX11-NEXT:    v_writelane_b32 v75, s102, 4
 ; GFX11-NEXT:    v_readfirstlane_b32 s14, v3
 ; GFX11-NEXT:    v_readfirstlane_b32 s41, v2
 ; GFX11-NEXT:    v_readfirstlane_b32 s40, v1
-; GFX11-NEXT:    v_writelane_b32 v74, s37, 5
-; GFX11-NEXT:    v_writelane_b32 v75, s101, 5
+; GFX11-NEXT:    v_writelane_b32 v74, s39, 5
+; GFX11-NEXT:    v_writelane_b32 v75, s103, 5
 ; GFX11-NEXT:    s_cmp_lg_u32 s42, 0
 ; GFX11-NEXT:    s_mov_b32 vcc_lo, 0
 ; GFX11-NEXT:    s_clause 0x11 ; 72-byte Folded Spill
@@ -184620,37 +184620,37 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
 ; GFX11-NEXT:    scratch_store_b32 off, v63, s32 offset:8
 ; GFX11-NEXT:    scratch_store_b32 off, v72, s32 offset:4
 ; GFX11-NEXT:    scratch_store_b32 off, v73, s32
-; GFX11-NEXT:    v_writelane_b32 v74, s38, 6
-; GFX11-NEXT:    v_writelane_b32 v75, s102, 6
+; GFX11-NEXT:    v_writelane_b32 v74, s48, 6
+; GFX11-NEXT:    v_writelane_b32 v75, s104, 6
 ; GFX11-NEXT:    ; implicit-def: $vgpr76 : SGPR spill to VGPR lane
 ; GFX11-NEXT:    ; implicit-def: $vgpr77 : SGPR spill to VGPR lane
-; GFX11-NEXT:    v_writelane_b32 v74, s39, 7
-; GFX11-NEXT:    v_writelane_b32 v75, s103, 7
-; GFX11-NEXT:    v_writelane_b32 v74, s48, 8
-; GFX11-NEXT:    v_writelane_b32 v75, s104, 8
-; GFX11-NEXT:    v_writelane_b32 v74, s49, 9
-; GFX11-NEXT:    v_writelane_b32 v74, s50, 10
-; GFX11-NEXT:    v_writelane_b32 v74, s51, 11
-; GFX11-NEXT:    v_writelane_b32 v74, s52, 12
-; GFX11-NEXT:    v_writelane_b32 v74, s53, 13
-; GFX11-NEXT:    v_writelane_b32 v74, s54, 14
-; GFX11-NEXT:    v_writelane_b32 v74, s55, 15
-; GFX11-NEXT:    v_writelane_b32 v74, s64, 16
-; GFX11-NEXT:    v_writelane_b32 v74, s65, 17
-; GFX11-NEXT:    v_writelane_b32 v74, s66, 18
-; GFX11-NEXT:    v_writelane_b32 v74, s67, 19
-; GFX11-NEXT:    v_writelane_b32 v74, s68, 20
-; GFX11-NEXT:    v_writelane_b32 v74, s69, 21
-; GFX11-NEXT:    v_writelane_b32 v74, s70, 22
-; GFX11-NEXT:    v_writelane_b32 v74, s71, 23
-; GFX11-NEXT:    v_writelane_b32 v74, s80, 24
-; GFX11-NEXT:    v_writelane_b32 v74, s81, 25
-; GFX11-NEXT:    v_writelane_b32 v74, s82, 26
-; GFX11-NEXT:    v_writelane_b32 v74, s83, 27
-; GFX11-NEXT:    v_writelane_b32 v74, s84, 28
-; GFX11-NEXT:    v_writelane_b32 v74, s85, 29
-; GFX11-NEXT:    v_writelane_b32 v74, s86, 30
-; GFX11-NEXT:    v_writelane_b32 v74, s87, 31
+; GFX11-NEXT:    v_writelane_b32 v74, s49, 7
+; GFX11-NEXT:    v_writelane_b32 v75, s30, 7
+; GFX11-NEXT:    v_writelane_b32 v74, s50, 8
+; GFX11-NEXT:    v_writelane_b32 v75, s31, 8
+; GFX11-NEXT:    v_writelane_b32 v74, s51, 9
+; GFX11-NEXT:    v_writelane_b32 v74, s52, 10
+; GFX11-NEXT:    v_writelane_b32 v74, s53, 11
+; GFX11-NEXT:    v_writelane_b32 v74, s54, 12
+; GFX11-NEXT:    v_writelane_b32 v74, s55, 13
+; GFX11-NEXT:    v_writelane_b32 v74, s64, 14
+; GFX11-NEXT:    v_writelane_b32 v74, s65, 15
+; GFX11-NEXT:    v_writelane_b32 v74, s66, 16
+; GFX11-NEXT:    v_writelane_b32 v74, s67, 17
+; GFX11-NEXT:    v_writelane_b32 v74, s68, 18
+; GFX11-NEXT:    v_writelane_b32 v74, s69, 19
+; GFX11-NEXT:    v_writelane_b32 v74, s70, 20
+; GFX11-NEXT:    v_writelane_b32 v74, s71, 21
+; GFX11-NEXT:    v_writelane_b32 v74, s80, 22
+; GFX11-NEXT:    v_writelane_b32 v74, s81, 23
+; GFX11-NEXT:    v_writelane_b32 v74, s82, 24
+; GFX11-NEXT:    v_writelane_b32 v74, s83, 25
+; GFX11-NEXT:    v_writelane_b32 v74, s84, 26
+; GFX11-NEXT:    v_writelane_b32 v74, s85, 27
+; GFX11-NEXT:    v_writelane_b32 v74, s86, 28
+; GFX11-NEXT:    v_writelane_b32 v74, s87, 29
+; GFX11-NEXT:    v_writelane_b32 v74, s96, 30
+; GFX11-NEXT:    v_writelane_b32 v74, s97, 31
 ; GFX11-NEXT:    s_cbranch_scc0 .LBB95_3
 ; GFX11-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX11-NEXT:    s_lshr_b32 s42, s27, 24
@@ -185373,47 +185373,47 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
 ; GFX11-NEXT:    scratch_load_b32 v42, off, s32 offset:60
 ; GFX11-NEXT:    scratch_load_b32 v41, off, s32 offset:64
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s32 offset:68
-; GFX11-NEXT:    v_readlane_b32 s104, v75, 8
-; GFX11-NEXT:    v_readlane_b32 s103, v75, 7
-; GFX11-NEXT:    v_readlane_b32 s102, v75, 6
-; GFX11-NEXT:    v_readlane_b32 s101, v75, 5
-; GFX11-NEXT:    v_readlane_b32 s100, v75, 4
-; GFX11-NEXT:    v_readlane_b32 s99, v75, 3
-; GFX11-NEXT:    v_readlane_b32 s98, v75, 2
-; GFX11-NEXT:    v_readlane_b32 s97, v75, 1
-; GFX11-NEXT:    v_readlane_b32 s96, v75, 0
-; GFX11-NEXT:    v_readlane_b32 s87, v74, 31
-; GFX11-NEXT:    v_readlane_b32 s86, v74, 30
-; GFX11-NEXT:    v_readlane_b32 s85, v74, 29
-; GFX11-NEXT:    v_readlane_b32 s84, v74, 28
-; GFX11-NEXT:    v_readlane_b32 s83, v74, 27
-; GFX11-NEXT:    v_readlane_b32 s82, v74, 26
-; GFX11-NEXT:    v_readlane_b32 s81, v74, 25
-; GFX11-NEXT:    v_readlane_b32 s80, v74, 24
-; GFX11-NEXT:    v_readlane_b32 s71, v74, 23
-; GFX11-NEXT:    v_readlane_b32 s70, v74, 22
-; GFX11-NEXT:    v_readlane_b32 s69, v74, 21
-; GFX11-NEXT:    v_readlane_b32 s68, v74, 20
-; GFX11-NEXT:    v_readlane_b32 s67, v74, 19
-; GFX11-NEXT:    v_readlane_b32 s66, v74, 18
-; GFX11-NEXT:    v_readlane_b32 s65, v74, 17
-; GFX11-NEXT:    v_readlane_b32 s64, v74, 16
-; GFX11-NEXT:    v_readlane_b32 s55, v74, 15
-; GFX11-NEXT:    v_readlane_b32 s54, v74, 14
-; GFX11-NEXT:    v_readlane_b32 s53, v74, 13
-; GFX11-NEXT:    v_readlane_b32 s52, v74, 12
-; GFX11-NEXT:    v_readlane_b32 s51, v74, 11
-; GFX11-NEXT:    v_readlane_b32 s50, v74, 10
-; GFX11-NEXT:    v_readlane_b32 s49, v74, 9
-; GFX11-NEXT:    v_readlane_b32 s48, v74, 8
-; GFX11-NEXT:    v_readlane_b32 s39, v74, 7
-; GFX11-NEXT:    v_readlane_b32 s38, v74, 6
-; GFX11-NEXT:    v_readlane_b32 s37, v74, 5
-; GFX11-NEXT:    v_readlane_b32 s36, v74, 4
-; GFX11-NEXT:    v_readlane_b32 s35, v74, 3
-; GFX11-NEXT:    v_readlane_b32 s34, v74, 2
-; GFX11-NEXT:    v_readlane_b32 s31, v74, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v74, 0
+; GFX11-NEXT:    v_readlane_b32 s30, v75, 7
+; GFX11-NEXT:    v_readlane_b32 s31, v75, 8
+; GFX11-NEXT:    v_readlane_b32 s104, v75, 6
+; GFX11-NEXT:    v_readlane_b32 s103, v75, 5
+; GFX11-NEXT:    v_readlane_b32 s102, v75, 4
+; GFX11-NEXT:    v_readlane_b32 s101, v75, 3
+; GFX11-NEXT:    v_readlane_b32 s100, v75, 2
+; GFX11-NEXT:    v_readlane_b32 s99, v75, 1
+; GFX11-NEXT:    v_readlane_b32 s98, v75, 0
+; GFX11-NEXT:    v_readlane_b32 s97, v74, 31
+; GFX11-NEXT:    v_readlane_b32 s96, v74, 30
+; GFX11-NEXT:    v_readlane_b32 s87, v74, 29
+; GFX11-NEXT:    v_readlane_b32 s86, v74, 28
+; GFX11-NEXT:    v_readlane_b32 s85, v74, 27
+; GFX11-NEXT:    v_readlane_b32 s84, v74, 26
+; GFX11-NEXT:    v_readlane_b32 s83, v74, 25
+; GFX11-NEXT:    v_readlane_b32 s82, v74, 24
+; GFX11-NEXT:    v_readlane_b32 s81, v74, 23
+; GFX11-NEXT:    v_readlane_b32 s80, v74, 22
+; GFX11-NEXT:    v_readlane_b32 s71, v74, 21
+; GFX11-NEXT:    v_readlane_b32 s70, v74, 20
+; GFX11-NEXT:    v_readlane_b32 s69, v74, 19
+; GFX11-NEXT:    v_readlane_b32 s68, v74, 18
+; GFX11-NEXT:    v_readlane_b32 s67, v74, 17
+; GFX11-NEXT:    v_readlane_b32 s66, v74, 16
+; GFX11-NEXT:    v_readlane_b32 s65, v74, 15
+; GFX11-NEXT:    v_readlane_b32 s64, v74, 14
+; GFX11-NEXT:    v_readlane_b32 s55, v74, 13
+; GFX11-NEXT:    v_readlane_b32 s54, v74, 12
+; GFX11-NEXT:    v_readlane_b32 s53, v74, 11
+; GFX11-NEXT:    v_readlane_b32 s52, v74, 10
+; GFX11-NEXT:    v_readlane_b32 s51, v74, 9
+; GFX11-NEXT:    v_readlane_b32 s50, v74, 8
+; GFX11-NEXT:    v_readlane_b32 s49, v74, 7
+; GFX11-NEXT:    v_readlane_b32 s48, v74, 6
+; GFX11-NEXT:    v_readlane_b32 s39, v74, 5
+; GFX11-NEXT:    v_readlane_b32 s38, v74, 4
+; GFX11-NEXT:    v_readlane_b32 s37, v74, 3
+; GFX11-NEXT:    v_readlane_b32 s36, v74, 2
+; GFX11-NEXT:    v_readlane_b32 s35, v74, 1
+; GFX11-NEXT:    v_readlane_b32 s34, v74, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX11-NEXT:    s_clause 0x3 ; 16-byte Folded Reload
 ; GFX11-NEXT:    scratch_load_b32 v74, off, s32 offset:72
@@ -191694,72 +191694,70 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    ; implicit-def: $vgpr43 : SGPR spill to VGPR lane
 ; SI-NEXT:    s_waitcnt expcnt(3)
-; SI-NEXT:    v_writelane_b32 v40, s30, 0
+; SI-NEXT:    v_writelane_b32 v40, s34, 0
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_writelane_b32 v43, s29, 0
 ; SI-NEXT:    v_writelane_b32 v43, s28, 1
 ; SI-NEXT:    v_writelane_b32 v43, s27, 2
 ; SI-NEXT:    v_writelane_b32 v43, s26, 3
 ; SI-NEXT:    v_writelane_b32 v43, s25, 4
-; SI-NEXT:    v_writelane_b32 v40, s31, 1
 ; SI-NEXT:    v_writelane_b32 v43, s24, 5
-; SI-NEXT:    v_writelane_b32 v40, s34, 2
 ; SI-NEXT:    v_writelane_b32 v43, s23, 6
-; SI-NEXT:    v_writelane_b32 v40, s35, 3
+; SI-NEXT:    v_writelane_b32 v40, s35, 1
 ; SI-NEXT:    v_writelane_b32 v43, s22, 7
-; SI-NEXT:    v_writelane_b32 v40, s36, 4
+; SI-NEXT:    v_writelane_b32 v40, s36, 2
 ; SI-NEXT:    v_writelane_b32 v43, s21, 8
-; SI-NEXT:    v_writelane_b32 v40, s37, 5
+; SI-NEXT:    v_writelane_b32 v40, s37, 3
 ; SI-NEXT:    v_writelane_b32 v43, s20, 9
-; SI-NEXT:    v_writelane_b32 v40, s38, 6
+; SI-NEXT:    v_writelane_b32 v40, s38, 4
 ; SI-NEXT:    v_writelane_b32 v43, s19, 10
-; SI-NEXT:    v_writelane_b32 v40, s39, 7
+; SI-NEXT:    v_writelane_b32 v40, s39, 5
 ; SI-NEXT:    v_writelane_b32 v43, s18, 11
-; SI-NEXT:    v_writelane_b32 v40, s48, 8
+; SI-NEXT:    v_writelane_b32 v40, s48, 6
 ; SI-NEXT:    v_writelane_b32 v43, s17, 12
-; SI-NEXT:    v_writelane_b32 v40, s49, 9
+; SI-NEXT:    v_writelane_b32 v40, s49, 7
 ; SI-NEXT:    v_writelane_b32 v43, s16, 13
 ; SI-NEXT:    v_readfirstlane_b32 s4, v30
-; SI-NEXT:    v_writelane_b32 v40, s50, 10
+; SI-NEXT:    v_writelane_b32 v40, s50, 8
 ; SI-NEXT:    v_writelane_b32 v43, s4, 14
 ; SI-NEXT:    v_readfirstlane_b32 s4, v29
-; SI-NEXT:    v_writelane_b32 v40, s51, 11
+; SI-NEXT:    v_writelane_b32 v40, s51, 9
 ; SI-NEXT:    v_writelane_b32 v43, s4, 15
 ; SI-NEXT:    v_readfirstlane_b32 s4, v28
-; SI-NEXT:    v_writelane_b32 v40, s52, 12
+; SI-NEXT:    v_writelane_b32 v40, s52, 10
 ; SI-NEXT:    v_writelane_b32 v43, s4, 16
 ; SI-NEXT:    v_readfirstlane_b32 s4, v27
-; SI-NEXT:    v_writelane_b32 v40, s53, 13
+; SI-NEXT:    v_writelane_b32 v40, s53, 11
 ; SI-NEXT:    v_writelane_b32 v43, s4, 17
 ; SI-NEXT:    v_readfirstlane_b32 s4, v26
-; SI-NEXT:    v_writelane_b32 v40, s54, 14
+; SI-NEXT:    v_writelane_b32 v40, s54, 12
 ; SI-NEXT:    v_writelane_b32 v43, s4, 18
 ; SI-NEXT:    v_readfirstlane_b32 s4, v25
-; SI-NEXT:    v_writelane_b32 v40, s55, 15
+; SI-NEXT:    v_writelane_b32 v40, s55, 13
 ; SI-NEXT:    v_writelane_b32 v43, s4, 19
 ; SI-NEXT:    v_readfirstlane_b32 s4, v24
-; SI-NEXT:    v_writelane_b32 v40, s64, 16
+; SI-NEXT:    v_writelane_b32 v40, s64, 14
 ; SI-NEXT:    v_writelane_b32 v43, s4, 20
 ; SI-NEXT:    v_readfirstlane_b32 s4, v23
-; SI-NEXT:    v_writelane_b32 v40, s65, 17
+; SI-NEXT:    v_writelane_b32 v40, s65, 15
 ; SI-NEXT:    v_writelane_b32 v43, s4, 21
 ; SI-NEXT:    v_readfirstlane_b32 s4, v22
-; SI-NEXT:    v_writelane_b32 v40, s66, 18
+; SI-NEXT:    v_writelane_b32 v40, s66, 16
 ; SI-NEXT:    v_writelane_b32 v43, s4, 22
 ; SI-NEXT:    v_readfirstlane_b32 s4, v21
-; SI-NEXT:    v_writelane_b32 v40, s67, 19
+; SI-NEXT:    v_writelane_b32 v40, s67, 17
 ; SI-NEXT:    v_writelane_b32 v43, s4, 23
 ; SI-NEXT:    v_readfirstlane_b32 s4, v20
-; SI-NEXT:    v_writelane_b32 v40, s68, 20
+; SI-NEXT:    v_writelane_b32 v40, s68, 18
 ; SI-NEXT:    v_writelane_b32 v43, s4, 24
 ; SI-NEXT:    v_readfirstlane_b32 s4, v19
-; SI-NEXT:    v_writelane_b32 v40, s69, 21
+; SI-NEXT:    v_writelane_b32 v40, s69, 19
 ; SI-NEXT:    v_writelane_b32 v43, s4, 25
 ; SI-NEXT:    v_readfirstlane_b32 s4, v17
-; SI-NEXT:    v_writelane_b32 v40, s70, 22
+; SI-NEXT:    v_writelane_b32 v40, s70, 20
 ; SI-NEXT:    v_writelane_b32 v43, s4, 26
 ; SI-NEXT:    v_readfirstlane_b32 s4, v16
-; SI-NEXT:    v_writelane_b32 v40, s71, 23
+; SI-NEXT:    v_writelane_b32 v40, s71, 21
 ; SI-NEXT:    v_readfirstlane_b32 s71, v18
 ; SI-NEXT:    v_writelane_b32 v43, s4, 27
 ; SI-NEXT:    v_readfirstlane_b32 s4, v15
@@ -191827,21 +191825,23 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:196
 ; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:192
 ; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:188
-; SI-NEXT:    v_writelane_b32 v40, s80, 24
-; SI-NEXT:    v_writelane_b32 v40, s81, 25
-; SI-NEXT:    v_writelane_b32 v40, s82, 26
-; SI-NEXT:    v_writelane_b32 v40, s83, 27
-; SI-NEXT:    v_writelane_b32 v40, s84, 28
+; SI-NEXT:    v_writelane_b32 v40, s80, 22
+; SI-NEXT:    v_writelane_b32 v40, s81, 23
+; SI-NEXT:    v_writelane_b32 v40, s82, 24
+; SI-NEXT:    v_writelane_b32 v40, s83, 25
+; SI-NEXT:    v_writelane_b32 v40, s84, 26
 ; SI-NEXT:    v_writelane_b32 v43, s4, 38
-; SI-NEXT:    v_writelane_b32 v40, s85, 29
-; SI-NEXT:    v_writelane_b32 v40, s86, 30
-; SI-NEXT:    v_writelane_b32 v40, s87, 31
-; SI-NEXT:    v_writelane_b32 v40, s96, 32
-; SI-NEXT:    v_writelane_b32 v40, s97, 33
-; SI-NEXT:    v_writelane_b32 v40, s98, 34
+; SI-NEXT:    v_writelane_b32 v40, s85, 27
+; SI-NEXT:    v_writelane_b32 v40, s86, 28
+; SI-NEXT:    v_writelane_b32 v40, s87, 29
+; SI-NEXT:    v_writelane_b32 v40, s96, 30
+; SI-NEXT:    v_writelane_b32 v40, s97, 31
+; SI-NEXT:    v_writelane_b32 v40, s98, 32
 ; SI-NEXT:    v_readfirstlane_b32 s28, v3
+; SI-NEXT:    v_writelane_b32 v40, s99, 33
+; SI-NEXT:    v_writelane_b32 v40, s30, 34
+; SI-NEXT:    v_writelane_b32 v40, s31, 35
 ; SI-NEXT:    ; implicit-def: $vgpr42 : SGPR spill to VGPR lane
-; SI-NEXT:    v_writelane_b32 v40, s99, 35
 ; SI-NEXT:    s_waitcnt vmcnt(14)
 ; SI-NEXT:    v_readfirstlane_b32 s56, v19
 ; SI-NEXT:    v_readfirstlane_b32 s47, v20
@@ -193155,6 +193155,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; SI-NEXT:    s_lshl_b32 s16, s68, 16
 ; SI-NEXT:    s_or_b32 s4, s4, s47
 ; SI-NEXT:    s_or_b32 s5, s5, s16
+; SI-NEXT:    v_readlane_b32 s30, v40, 34
 ; SI-NEXT:    v_mov_b32_e32 v0, s17
 ; SI-NEXT:    v_mov_b32_e32 v1, s18
 ; SI-NEXT:    v_mov_b32_e32 v2, s19
@@ -193187,42 +193188,41 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; SI-NEXT:    v_mov_b32_e32 v29, s7
 ; SI-NEXT:    v_mov_b32_e32 v30, s4
 ; SI-NEXT:    v_mov_b32_e32 v31, s5
-; SI-NEXT:    v_readlane_b32 s99, v40, 35
-; SI-NEXT:    v_readlane_b32 s98, v40, 34
-; SI-NEXT:    v_readlane_b32 s97, v40, 33
-; SI-NEXT:    v_readlane_b32 s96, v40, 32
-; SI-NEXT:    v_readlane_b32 s87, v40, 31
-; SI-NEXT:    v_readlane_b32 s86, v40, 30
-; SI-NEXT:    v_readlane_b32 s85, v40, 29
-; SI-NEXT:    v_readlane_b32 s84, v40, 28
-; SI-NEXT:    v_readlane_b32 s83, v40, 27
-; SI-NEXT:    v_readlane_b32 s82, v40, 26
-; SI-NEXT:    v_readlane_b32 s81, v40, 25
-; SI-NEXT:    v_readlane_b32 s80, v40, 24
-; SI-NEXT:    v_readlane_b32 s71, v40, 23
-; SI-NEXT:    v_readlane_b32 s70, v40, 22
-; SI-NEXT:    v_readlane_b32 s69, v40, 21
-; SI-NEXT:    v_readlane_b32 s68, v40, 20
-; SI-NEXT:    v_readlane_b32 s67, v40, 19
-; SI-NEXT:    v_readlane_b32 s66, v40, 18
-; SI-NEXT:    v_readlane_b32 s65, v40, 17
-; SI-NEXT:    v_readlane_b32 s64, v40, 16
-; SI-NEXT:    v_readlane_b32 s55, v40, 15
-; SI-NEXT:    v_readlane_b32 s54, v40, 14
-; SI-NEXT:    v_readlane_b32 s53, v40, 13
-; SI-NEXT:    v_readlane_b32 s52, v40, 12
-; SI-NEXT:    v_readlane_b32 s51, v40, 11
-; SI-NEXT:    v_readlane_b32 s50, v40, 10
-; SI-NEXT:    v_readlane_b32 s49, v40, 9
-; SI-NEXT:    v_readlane_b32 s48, v40, 8
-; SI-NEXT:    v_readlane_b32 s39, v40, 7
-; SI-NEXT:    v_readlane_b32 s38, v40, 6
-; SI-NEXT:    v_readlane_b32 s37, v40, 5
-; SI-NEXT:    v_readlane_b32 s36, v40, 4
-; SI-NEXT:    v_readlane_b32 s35, v40, 3
-; SI-NEXT:    v_readlane_b32 s34, v40, 2
-; SI-NEXT:    v_readlane_b32 s31, v40, 1
-; SI-NEXT:    v_readlane_b32 s30, v40, 0
+; SI-NEXT:    v_readlane_b32 s31, v40, 35
+; SI-NEXT:    v_readlane_b32 s99, v40, 33
+; SI-NEXT:    v_readlane_b32 s98, v40, 32
+; SI-NEXT:    v_readlane_b32 s97, v40, 31
+; SI-NEXT:    v_readlane_b32 s96, v40, 30
+; SI-NEXT:    v_readlane_b32 s87, v40, 29
+; SI-NEXT:    v_readlane_b32 s86, v40, 28
+; SI-NEXT:    v_readlane_b32 s85, v40, 27
+; SI-NEXT:    v_readlane_b32 s84, v40, 26
+; SI-NEXT:    v_readlane_b32 s83, v40, 25
+; SI-NEXT:    v_readlane_b32 s82, v40, 24
+; SI-NEXT:    v_readlane_b32 s81, v40, 23
+; SI-NEXT:    v_readlane_b32 s80, v40, 22
+; SI-NEXT:    v_readlane_b32 s71, v40, 21
+; SI-NEXT:    v_readlane_b32 s70, v40, 20
+; SI-NEXT:    v_readlane_b32 s69, v40, 19
+; SI-NEXT:    v_readlane_b32 s68, v40, 18
+; SI-NEXT:    v_readlane_b32 s67, v40, 17
+; SI-NEXT:    v_readlane_b32 s66, v40, 16
+; SI-NEXT:    v_readlane_b32 s65, v40, 15
+; SI-NEXT:    v_readlane_b32 s64, v40, 14
+; SI-NEXT:    v_readlane_b32 s55, v40, 13
+; SI-NEXT:    v_readlane_b32 s54, v40, 12
+; SI-NEXT:    v_readlane_b32 s53, v40, 11
+; SI-NEXT:    v_readlane_b32 s52, v40, 10
+; SI-NEXT:    v_readlane_b32 s51, v40, 9
+; SI-NEXT:    v_readlane_b32 s50, v40, 8
+; SI-NEXT:    v_readlane_b32 s49, v40, 7
+; SI-NEXT:    v_readlane_b32 s48, v40, 6
+; SI-NEXT:    v_readlane_b32 s39, v40, 5
+; SI-NEXT:    v_readlane_b32 s38, v40, 4
+; SI-NEXT:    v_readlane_b32 s37, v40, 3
+; SI-NEXT:    v_readlane_b32 s36, v40, 2
+; SI-NEXT:    v_readlane_b32 s35, v40, 1
+; SI-NEXT:    v_readlane_b32 s34, v40, 0
 ; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
@@ -201096,61 +201096,61 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(2)
-; SI-NEXT:    v_writelane_b32 v20, s30, 0
-; SI-NEXT:    v_writelane_b32 v20, s31, 1
-; SI-NEXT:    v_writelane_b32 v20, s34, 2
-; SI-NEXT:    v_writelane_b32 v20, s35, 3
-; SI-NEXT:    v_writelane_b32 v20, s36, 4
-; SI-NEXT:    v_writelane_b32 v20, s37, 5
-; SI-NEXT:    v_writelane_b32 v20, s38, 6
-; SI-NEXT:    v_writelane_b32 v20, s39, 7
-; SI-NEXT:    v_writelane_b32 v20, s48, 8
-; SI-NEXT:    v_writelane_b32 v20, s49, 9
-; SI-NEXT:    v_writelane_b32 v20, s50, 10
-; SI-NEXT:    v_writelane_b32 v20, s51, 11
-; SI-NEXT:    v_writelane_b32 v20, s52, 12
-; SI-NEXT:    v_writelane_b32 v20, s53, 13
-; SI-NEXT:    v_writelane_b32 v20, s54, 14
-; SI-NEXT:    v_writelane_b32 v20, s55, 15
-; SI-NEXT:    v_writelane_b32 v20, s64, 16
-; SI-NEXT:    v_writelane_b32 v20, s65, 17
-; SI-NEXT:    v_writelane_b32 v20, s66, 18
-; SI-NEXT:    v_writelane_b32 v20, s67, 19
-; SI-NEXT:    v_writelane_b32 v20, s68, 20
-; SI-NEXT:    v_writelane_b32 v20, s69, 21
-; SI-NEXT:    v_writelane_b32 v20, s70, 22
-; SI-NEXT:    v_writelane_b32 v20, s71, 23
-; SI-NEXT:    v_writelane_b32 v20, s80, 24
-; SI-NEXT:    v_writelane_b32 v20, s81, 25
-; SI-NEXT:    v_writelane_b32 v20, s82, 26
-; SI-NEXT:    v_writelane_b32 v20, s83, 27
+; SI-NEXT:    v_writelane_b32 v20, s34, 0
+; SI-NEXT:    v_writelane_b32 v20, s35, 1
+; SI-NEXT:    v_writelane_b32 v20, s36, 2
+; SI-NEXT:    v_writelane_b32 v20, s37, 3
+; SI-NEXT:    v_writelane_b32 v20, s38, 4
+; SI-NEXT:    v_writelane_b32 v20, s39, 5
+; SI-NEXT:    v_writelane_b32 v20, s48, 6
+; SI-NEXT:    v_writelane_b32 v20, s49, 7
+; SI-NEXT:    v_writelane_b32 v20, s50, 8
+; SI-NEXT:    v_writelane_b32 v20, s51, 9
+; SI-NEXT:    v_writelane_b32 v20, s52, 10
+; SI-NEXT:    v_writelane_b32 v20, s53, 11
+; SI-NEXT:    v_writelane_b32 v20, s54, 12
+; SI-NEXT:    v_writelane_b32 v20, s55, 13
+; SI-NEXT:    v_writelane_b32 v20, s64, 14
+; SI-NEXT:    v_writelane_b32 v20, s65, 15
+; SI-NEXT:    v_writelane_b32 v20, s66, 16
+; SI-NEXT:    v_writelane_b32 v20, s67, 17
+; SI-NEXT:    v_writelane_b32 v20, s68, 18
+; SI-NEXT:    v_writelane_b32 v20, s69, 19
+; SI-NEXT:    v_writelane_b32 v20, s70, 20
+; SI-NEXT:    v_writelane_b32 v20, s71, 21
+; SI-NEXT:    v_writelane_b32 v20, s80, 22
+; SI-NEXT:    v_writelane_b32 v20, s81, 23
+; SI-NEXT:    v_writelane_b32 v20, s82, 24
+; SI-NEXT:    v_writelane_b32 v20, s83, 25
+; SI-NEXT:    v_writelane_b32 v20, s84, 26
+; SI-NEXT:    v_writelane_b32 v20, s85, 27
 ; SI-NEXT:    v_readfirstlane_b32 s37, v1
-; SI-NEXT:    v_writelane_b32 v20, s84, 28
+; SI-NEXT:    v_writelane_b32 v20, s86, 28
 ; SI-NEXT:    v_readfirstlane_b32 s39, v3
 ; SI-NEXT:    s_lshr_b32 s6, s37, 16
 ; SI-NEXT:    ; implicit-def: $vgpr21 : SGPR spill to VGPR lane
-; SI-NEXT:    v_writelane_b32 v20, s85, 29
+; SI-NEXT:    v_writelane_b32 v20, s87, 29
 ; SI-NEXT:    v_readfirstlane_b32 s49, v5
 ; SI-NEXT:    s_lshr_b32 s7, s39, 16
 ; SI-NEXT:    s_waitcnt expcnt(1)
 ; SI-NEXT:    v_writelane_b32 v21, s6, 0
-; SI-NEXT:    v_writelane_b32 v20, s86, 30
+; SI-NEXT:    v_writelane_b32 v20, s96, 30
 ; SI-NEXT:    v_readfirstlane_b32 s51, v7
 ; SI-NEXT:    s_lshr_b32 s8, s49, 16
 ; SI-NEXT:    v_writelane_b32 v21, s7, 1
-; SI-NEXT:    v_writelane_b32 v20, s87, 31
+; SI-NEXT:    v_writelane_b32 v20, s97, 31
 ; SI-NEXT:    v_readfirstlane_b32 s53, v9
 ; SI-NEXT:    s_lshr_b32 s9, s51, 16
 ; SI-NEXT:    v_writelane_b32 v21, s8, 2
-; SI-NEXT:    v_writelane_b32 v20, s96, 32
+; SI-NEXT:    v_writelane_b32 v20, s98, 32
 ; SI-NEXT:    v_readfirstlane_b32 s55, v11
 ; SI-NEXT:    s_lshr_b32 s10, s53, 16
 ; SI-NEXT:    v_writelane_b32 v21, s9, 3
-; SI-NEXT:    v_writelane_b32 v20, s97, 33
+; SI-NEXT:    v_writelane_b32 v20, s99, 33
 ; SI-NEXT:    v_readfirstlane_b32 s65, v13
 ; SI-NEXT:    s_lshr_b32 s11, s55, 16
 ; SI-NEXT:    v_writelane_b32 v21, s10, 4
-; SI-NEXT:    v_writelane_b32 v20, s98, 34
+; SI-NEXT:    v_writelane_b32 v20, s30, 34
 ; SI-NEXT:    v_readfirstlane_b32 s68, v18
 ; SI-NEXT:    v_readfirstlane_b32 s69, v17
 ; SI-NEXT:    v_readfirstlane_b32 s66, v16
@@ -201164,7 +201164,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    v_readfirstlane_b32 s38, v4
 ; SI-NEXT:    v_readfirstlane_b32 s36, v2
 ; SI-NEXT:    v_writelane_b32 v21, s11, 5
-; SI-NEXT:    v_writelane_b32 v20, s99, 35
+; SI-NEXT:    v_writelane_b32 v20, s31, 35
 ; SI-NEXT:    s_lshr_b32 s92, s29, 16
 ; SI-NEXT:    s_lshr_b32 s99, s28, 16
 ; SI-NEXT:    s_lshr_b32 s93, s27, 16
@@ -202390,6 +202390,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x7c, v0
 ; SI-NEXT:    v_mov_b32_e32 v1, s4
+; SI-NEXT:    v_readlane_b32 s30, v20, 34
 ; SI-NEXT:    v_readlane_b32 s19, v21, 62
 ; SI-NEXT:    v_readlane_b32 s17, v22, 4
 ; SI-NEXT:    v_readlane_b32 s15, v22, 10
@@ -202397,42 +202398,41 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    v_readlane_b32 s11, v22, 22
 ; SI-NEXT:    v_readlane_b32 s9, v22, 26
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT:    v_readlane_b32 s99, v20, 35
-; SI-NEXT:    v_readlane_b32 s98, v20, 34
-; SI-NEXT:    v_readlane_b32 s97, v20, 33
-; SI-NEXT:    v_readlane_b32 s96, v20, 32
-; SI-NEXT:    v_readlane_b32 s87, v20, 31
-; SI-NEXT:    v_readlane_b32 s86, v20, 30
-; SI-NEXT:    v_readlane_b32 s85, v20, 29
-; SI-NEXT:    v_readlane_b32 s84, v20, 28
-; SI-NEXT:    v_readlane_b32 s83, v20, 27
-; SI-NEXT:    v_readlane_b32 s82, v20, 26
-; SI-NEXT:    v_readlane_b32 s81, v20, 25
-; SI-NEXT:    v_readlane_b32 s80, v20, 24
-; SI-NEXT:    v_readlane_b32 s71, v20, 23
-; SI-NEXT:    v_readlane_b32 s70, v20, 22
-; SI-NEXT:    v_readlane_b32 s69, v20, 21
-; SI-NEXT:    v_readlane_b32 s68, v20, 20
-; SI-NEXT:    v_readlane_b32 s67, v20, 19
-; SI-NEXT:    v_readlane_b32 s66, v20, 18
-; SI-NEXT:    v_readlane_b32 s65, v20, 17
-; SI-NEXT:    v_readlane_b32 s64, v20, 16
-; SI-NEXT:    v_readlane_b32 s55, v20, 15
-; SI-NEXT:    v_readlane_b32 s54, v20, 14
-; SI-NEXT:    v_readlane_b32 s53, v20, 13
-; SI-NEXT:    v_readlane_b32 s52, v20, 12
-; SI-NEXT:    v_readlane_b32 s51, v20, 11
-; SI-NEXT:    v_readlane_b32 s50, v20, 10
-; SI-NEXT:    v_readlane_b32 s49, v20, 9
-; SI-NEXT:    v_readlane_b32 s48, v20, 8
-; SI-NEXT:    v_readlane_b32 s39, v20, 7
-; SI-NEXT:    v_readlane_b32 s38, v20, 6
-; SI-NEXT:    v_readlane_b32 s37, v20, 5
-; SI-NEXT:    v_readlane_b32 s36, v20, 4
-; SI-NEXT:    v_readlane_b32 s35, v20, 3
-; SI-NEXT:    v_readlane_b32 s34, v20, 2
-; SI-NEXT:    v_readlane_b32 s31, v20, 1
-; SI-NEXT:    v_readlane_b32 s30, v20, 0
+; SI-NEXT:    v_readlane_b32 s31, v20, 35
+; SI-NEXT:    v_readlane_b32 s99, v20, 33
+; SI-NEXT:    v_readlane_b32 s98, v20, 32
+; SI-NEXT:    v_readlane_b32 s97, v20, 31
+; SI-NEXT:    v_readlane_b32 s96, v20, 30
+; SI-NEXT:    v_readlane_b32 s87, v20, 29
+; SI-NEXT:    v_readlane_b32 s86, v20, 28
+; SI-NEXT:    v_readlane_b32 s85, v20, 27
+; SI-NEXT:    v_readlane_b32 s84, v20, 26
+; SI-NEXT:    v_readlane_b32 s83, v20, 25
+; SI-NEXT:    v_readlane_b32 s82, v20, 24
+; SI-NEXT:    v_readlane_b32 s81, v20, 23
+; SI-NEXT:    v_readlane_b32 s80, v20, 22
+; SI-NEXT:    v_readlane_b32 s71, v20, 21
+; SI-NEXT:    v_readlane_b32 s70, v20, 20
+; SI-NEXT:    v_readlane_b32 s69, v20, 19
+; SI-NEXT:    v_readlane_b32 s68, v20, 18
+; SI-NEXT:    v_readlane_b32 s67, v20, 17
+; SI-NEXT:    v_readlane_b32 s66, v20, 16
+; SI-NEXT:    v_readlane_b32 s65, v20, 15
+; SI-NEXT:    v_readlane_b32 s64, v20, 14
+; SI-NEXT:    v_readlane_b32 s55, v20, 13
+; SI-NEXT:    v_readlane_b32 s54, v20, 12
+; SI-NEXT:    v_readlane_b32 s53, v20, 11
+; SI-NEXT:    v_readlane_b32 s52, v20, 10
+; SI-NEXT:    v_readlane_b32 s51, v20, 9
+; SI-NEXT:    v_readlane_b32 s50, v20, 8
+; SI-NEXT:    v_readlane_b32 s49, v20, 7
+; SI-NEXT:    v_readlane_b32 s48, v20, 6
+; SI-NEXT:    v_readlane_b32 s39, v20, 5
+; SI-NEXT:    v_readlane_b32 s38, v20, 4
+; SI-NEXT:    v_readlane_b32 s37, v20, 3
+; SI-NEXT:    v_readlane_b32 s36, v20, 2
+; SI-NEXT:    v_readlane_b32 s35, v20, 1
+; SI-NEXT:    v_readlane_b32 s34, v20, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -202661,38 +202661,38 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v32, s30, 0
-; VI-NEXT:    v_writelane_b32 v32, s31, 1
-; VI-NEXT:    v_writelane_b32 v32, s34, 2
-; VI-NEXT:    v_writelane_b32 v32, s35, 3
-; VI-NEXT:    v_writelane_b32 v32, s36, 4
-; VI-NEXT:    v_writelane_b32 v32, s37, 5
-; VI-NEXT:    v_writelane_b32 v32, s38, 6
-; VI-NEXT:    v_writelane_b32 v32, s39, 7
-; VI-NEXT:    v_writelane_b32 v32, s48, 8
-; VI-NEXT:    v_writelane_b32 v32, s49, 9
-; VI-NEXT:    v_writelane_b32 v32, s50, 10
-; VI-NEXT:    v_writelane_b32 v32, s51, 11
-; VI-NEXT:    v_writelane_b32 v32, s52, 12
-; VI-NEXT:    v_writelane_b32 v32, s53, 13
-; VI-NEXT:    v_writelane_b32 v32, s54, 14
-; VI-NEXT:    v_writelane_b32 v32, s55, 15
-; VI-NEXT:    v_writelane_b32 v32, s64, 16
-; VI-NEXT:    v_writelane_b32 v32, s65, 17
-; VI-NEXT:    v_writelane_b32 v32, s66, 18
-; VI-NEXT:    v_writelane_b32 v32, s67, 19
-; VI-NEXT:    v_writelane_b32 v32, s68, 20
-; VI-NEXT:    v_writelane_b32 v32, s69, 21
-; VI-NEXT:    v_writelane_b32 v32, s70, 22
-; VI-NEXT:    v_writelane_b32 v32, s71, 23
-; VI-NEXT:    v_writelane_b32 v32, s80, 24
-; VI-NEXT:    v_writelane_b32 v32, s81, 25
-; VI-NEXT:    v_writelane_b32 v32, s82, 26
-; VI-NEXT:    v_writelane_b32 v32, s83, 27
-; VI-NEXT:    v_writelane_b32 v32, s84, 28
-; VI-NEXT:    v_writelane_b32 v32, s85, 29
+; VI-NEXT:    v_writelane_b32 v32, s34, 0
+; VI-NEXT:    v_writelane_b32 v32, s35, 1
+; VI-NEXT:    v_writelane_b32 v32, s36, 2
+; VI-NEXT:    v_writelane_b32 v32, s37, 3
+; VI-NEXT:    v_writelane_b32 v32, s38, 4
+; VI-NEXT:    v_writelane_b32 v32, s39, 5
+; VI-NEXT:    v_writelane_b32 v32, s48, 6
+; VI-NEXT:    v_writelane_b32 v32, s49, 7
+; VI-NEXT:    v_writelane_b32 v32, s50, 8
+; VI-NEXT:    v_writelane_b32 v32, s51, 9
+; VI-NEXT:    v_writelane_b32 v32, s52, 10
+; VI-NEXT:    v_writelane_b32 v32, s53, 11
+; VI-NEXT:    v_writelane_b32 v32, s54, 12
+; VI-NEXT:    v_writelane_b32 v32, s55, 13
+; VI-NEXT:    v_writelane_b32 v32, s64, 14
+; VI-NEXT:    v_writelane_b32 v32, s65, 15
+; VI-NEXT:    v_writelane_b32 v32, s66, 16
+; VI-NEXT:    v_writelane_b32 v32, s67, 17
+; VI-NEXT:    v_writelane_b32 v32, s68, 18
+; VI-NEXT:    v_writelane_b32 v32, s69, 19
+; VI-NEXT:    v_writelane_b32 v32, s70, 20
+; VI-NEXT:    v_writelane_b32 v32, s71, 21
+; VI-NEXT:    v_writelane_b32 v32, s80, 22
+; VI-NEXT:    v_writelane_b32 v32, s81, 23
+; VI-NEXT:    v_writelane_b32 v32, s82, 24
+; VI-NEXT:    v_writelane_b32 v32, s83, 25
+; VI-NEXT:    v_writelane_b32 v32, s84, 26
+; VI-NEXT:    v_writelane_b32 v32, s85, 27
+; VI-NEXT:    v_writelane_b32 v32, s86, 28
+; VI-NEXT:    v_writelane_b32 v32, s87, 29
 ; VI-NEXT:    v_readfirstlane_b32 s44, v19
-; VI-NEXT:    v_writelane_b32 v32, s86, 30
+; VI-NEXT:    v_writelane_b32 v32, s30, 30
 ; VI-NEXT:    v_readfirstlane_b32 s5, v18
 ; VI-NEXT:    v_readfirstlane_b32 s4, v17
 ; VI-NEXT:    v_readfirstlane_b32 s7, v16
@@ -202712,7 +202712,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; VI-NEXT:    v_readfirstlane_b32 s45, v2
 ; VI-NEXT:    s_cmp_lg_u32 s44, 0
 ; VI-NEXT:    v_readfirstlane_b32 s44, v1
-; VI-NEXT:    v_writelane_b32 v32, s87, 31
+; VI-NEXT:    v_writelane_b32 v32, s31, 31
 ; VI-NEXT:    ; implicit-def: $vgpr33 : SGPR spill to VGPR lane
 ; VI-NEXT:    s_cbranch_scc0 .LBB99_4
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
@@ -203500,39 +203500,39 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; VI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x7c, v0
+; VI-NEXT:    v_readlane_b32 s30, v32, 30
 ; VI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; VI-NEXT:    v_readlane_b32 s87, v32, 31
-; VI-NEXT:    v_readlane_b32 s86, v32, 30
-; VI-NEXT:    v_readlane_b32 s85, v32, 29
-; VI-NEXT:    v_readlane_b32 s84, v32, 28
-; VI-NEXT:    v_readlane_b32 s83, v32, 27
-; VI-NEXT:    v_readlane_b32 s82, v32, 26
-; VI-NEXT:    v_readlane_b32 s81, v32, 25
-; VI-NEXT:    v_readlane_b32 s80, v32, 24
-; VI-NEXT:    v_readlane_b32 s71, v32, 23
-; VI-NEXT:    v_readlane_b32 s70, v32, 22
-; VI-NEXT:    v_readlane_b32 s69, v32, 21
-; VI-NEXT:    v_readlane_b32 s68, v32, 20
-; VI-NEXT:    v_readlane_b32 s67, v32, 19
-; VI-NEXT:    v_readlane_b32 s66, v32, 18
-; VI-NEXT:    v_readlane_b32 s65, v32, 17
-; VI-NEXT:    v_readlane_b32 s64, v32, 16
-; VI-NEXT:    v_readlane_b32 s55, v32, 15
-; VI-NEXT:    v_readlane_b32 s54, v32, 14
-; VI-NEXT:    v_readlane_b32 s53, v32, 13
-; VI-NEXT:    v_readlane_b32 s52, v32, 12
-; VI-NEXT:    v_readlane_b32 s51, v32, 11
-; VI-NEXT:    v_readlane_b32 s50, v32, 10
-; VI-NEXT:    v_readlane_b32 s49, v32, 9
-; VI-NEXT:    v_readlane_b32 s48, v32, 8
-; VI-NEXT:    v_readlane_b32 s39, v32, 7
-; VI-NEXT:    v_readlane_b32 s38, v32, 6
-; VI-NEXT:    v_readlane_b32 s37, v32, 5
-; VI-NEXT:    v_readlane_b32 s36, v32, 4
-; VI-NEXT:    v_readlane_b32 s35, v32, 3
-; VI-NEXT:    v_readlane_b32 s34, v32, 2
-; VI-NEXT:    v_readlane_b32 s31, v32, 1
-; VI-NEXT:    v_readlane_b32 s30, v32, 0
+; VI-NEXT:    v_readlane_b32 s31, v32, 31
+; VI-NEXT:    v_readlane_b32 s87, v32, 29
+; VI-NEXT:    v_readlane_b32 s86, v32, 28
+; VI-NEXT:    v_readlane_b32 s85, v32, 27
+; VI-NEXT:    v_readlane_b32 s84, v32, 26
+; VI-NEXT:    v_readlane_b32 s83, v32, 25
+; VI-NEXT:    v_readlane_b32 s82, v32, 24
+; VI-NEXT:    v_readlane_b32 s81, v32, 23
+; VI-NEXT:    v_readlane_b32 s80, v32, 22
+; VI-NEXT:    v_readlane_b32 s71, v32, 21
+; VI-NEXT:    v_readlane_b32 s70, v32, 20
+; VI-NEXT:    v_readlane_b32 s69, v32, 19
+; VI-NEXT:    v_readlane_b32 s68, v32, 18
+; VI-NEXT:    v_readlane_b32 s67, v32, 17
+; VI-NEXT:    v_readlane_b32 s66, v32, 16
+; VI-NEXT:    v_readlane_b32 s65, v32, 15
+; VI-NEXT:    v_readlane_b32 s64, v32, 14
+; VI-NEXT:    v_readlane_b32 s55, v32, 13
+; VI-NEXT:    v_readlane_b32 s54, v32, 12
+; VI-NEXT:    v_readlane_b32 s53, v32, 11
+; VI-NEXT:    v_readlane_b32 s52, v32, 10
+; VI-NEXT:    v_readlane_b32 s51, v32, 9
+; VI-NEXT:    v_readlane_b32 s50, v32, 8
+; VI-NEXT:    v_readlane_b32 s49, v32, 7
+; VI-NEXT:    v_readlane_b32 s48, v32, 6
+; VI-NEXT:    v_readlane_b32 s39, v32, 5
+; VI-NEXT:    v_readlane_b32 s38, v32, 4
+; VI-NEXT:    v_readlane_b32 s37, v32, 3
+; VI-NEXT:    v_readlane_b32 s36, v32, 2
+; VI-NEXT:    v_readlane_b32 s35, v32, 1
+; VI-NEXT:    v_readlane_b32 s34, v32, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -203703,43 +203703,43 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; GFX9-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v63, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v63, s31, 1
-; GFX9-NEXT:    v_writelane_b32 v63, s34, 2
-; GFX9-NEXT:    v_writelane_b32 v63, s35, 3
-; GFX9-NEXT:    v_writelane_b32 v63, s36, 4
-; GFX9-NEXT:    v_writelane_b32 v63, s37, 5
-; GFX9-NEXT:    v_writelane_b32 v63, s38, 6
-; GFX9-NEXT:    v_writelane_b32 v63, s39, 7
-; GFX9-NEXT:    v_writelane_b32 v63, s48, 8
-; GFX9-NEXT:    v_writelane_b32 v63, s49, 9
-; GFX9-NEXT:    v_writelane_b32 v63, s50, 10
-; GFX9-NEXT:    v_writelane_b32 v63, s51, 11
-; GFX9-NEXT:    v_writelane_b32 v63, s52, 12
-; GFX9-NEXT:    v_writelane_b32 v63, s53, 13
-; GFX9-NEXT:    v_writelane_b32 v63, s54, 14
-; GFX9-NEXT:    v_writelane_b32 v63, s55, 15
-; GFX9-NEXT:    v_writelane_b32 v63, s64, 16
-; GFX9-NEXT:    v_writelane_b32 v63, s65, 17
-; GFX9-NEXT:    v_writelane_b32 v63, s66, 18
-; GFX9-NEXT:    v_writelane_b32 v63, s67, 19
-; GFX9-NEXT:    v_writelane_b32 v63, s68, 20
-; GFX9-NEXT:    v_writelane_b32 v63, s69, 21
-; GFX9-NEXT:    v_writelane_b32 v63, s70, 22
-; GFX9-NEXT:    v_writelane_b32 v63, s71, 23
-; GFX9-NEXT:    v_writelane_b32 v63, s80, 24
-; GFX9-NEXT:    v_writelane_b32 v63, s81, 25
-; GFX9-NEXT:    v_writelane_b32 v63, s82, 26
-; GFX9-NEXT:    v_writelane_b32 v63, s83, 27
-; GFX9-NEXT:    v_writelane_b32 v63, s84, 28
-; GFX9-NEXT:    v_writelane_b32 v63, s85, 29
-; GFX9-NEXT:    v_writelane_b32 v63, s86, 30
-; GFX9-NEXT:    v_writelane_b32 v63, s87, 31
-; GFX9-NEXT:    v_writelane_b32 v63, s96, 32
-; GFX9-NEXT:    v_writelane_b32 v63, s97, 33
-; GFX9-NEXT:    v_writelane_b32 v63, s98, 34
+; GFX9-NEXT:    v_writelane_b32 v63, s34, 0
+; GFX9-NEXT:    v_writelane_b32 v63, s35, 1
+; GFX9-NEXT:    v_writelane_b32 v63, s36, 2
+; GFX9-NEXT:    v_writelane_b32 v63, s37, 3
+; GFX9-NEXT:    v_writelane_b32 v63, s38, 4
+; GFX9-NEXT:    v_writelane_b32 v63, s39, 5
+; GFX9-NEXT:    v_writelane_b32 v63, s48, 6
+; GFX9-NEXT:    v_writelane_b32 v63, s49, 7
+; GFX9-NEXT:    v_writelane_b32 v63, s50, 8
+; GFX9-NEXT:    v_writelane_b32 v63, s51, 9
+; GFX9-NEXT:    v_writelane_b32 v63, s52, 10
+; GFX9-NEXT:    v_writelane_b32 v63, s53, 11
+; GFX9-NEXT:    v_writelane_b32 v63, s54, 12
+; GFX9-NEXT:    v_writelane_b32 v63, s55, 13
+; GFX9-NEXT:    v_writelane_b32 v63, s64, 14
+; GFX9-NEXT:    v_writelane_b32 v63, s65, 15
+; GFX9-NEXT:    v_writelane_b32 v63, s66, 16
+; GFX9-NEXT:    v_writelane_b32 v63, s67, 17
+; GFX9-NEXT:    v_writelane_b32 v63, s68, 18
+; GFX9-NEXT:    v_writelane_b32 v63, s69, 19
+; GFX9-NEXT:    v_writelane_b32 v63, s70, 20
+; GFX9-NEXT:    v_writelane_b32 v63, s71, 21
+; GFX9-NEXT:    v_writelane_b32 v63, s80, 22
+; GFX9-NEXT:    v_writelane_b32 v63, s81, 23
+; GFX9-NEXT:    v_writelane_b32 v63, s82, 24
+; GFX9-NEXT:    v_writelane_b32 v63, s83, 25
+; GFX9-NEXT:    v_writelane_b32 v63, s84, 26
+; GFX9-NEXT:    v_writelane_b32 v63, s85, 27
+; GFX9-NEXT:    v_writelane_b32 v63, s86, 28
+; GFX9-NEXT:    v_writelane_b32 v63, s87, 29
+; GFX9-NEXT:    v_writelane_b32 v63, s96, 30
+; GFX9-NEXT:    v_writelane_b32 v63, s97, 31
+; GFX9-NEXT:    v_writelane_b32 v63, s98, 32
+; GFX9-NEXT:    v_writelane_b32 v63, s99, 33
+; GFX9-NEXT:    v_writelane_b32 v63, s30, 34
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v19
-; GFX9-NEXT:    v_writelane_b32 v63, s99, 35
+; GFX9-NEXT:    v_writelane_b32 v63, s31, 35
 ; GFX9-NEXT:    v_readfirstlane_b32 s7, v18
 ; GFX9-NEXT:    v_readfirstlane_b32 s6, v17
 ; GFX9-NEXT:    v_readfirstlane_b32 s9, v16
@@ -204603,42 +204603,42 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; GFX9-NEXT:    v_perm_b32 v30, v30, v56, s4
 ; GFX9-NEXT:    v_perm_b32 v27, v27, v46, s4
 ; GFX9-NEXT:    v_perm_b32 v1, v1, v45, s4
-; GFX9-NEXT:    v_readlane_b32 s99, v63, 35
-; GFX9-NEXT:    v_readlane_b32 s98, v63, 34
-; GFX9-NEXT:    v_readlane_b32 s97, v63, 33
-; GFX9-NEXT:    v_readlane_b32 s96, v63, 32
-; GFX9-NEXT:    v_readlane_b32 s87, v63, 31
-; GFX9-NEXT:    v_readlane_b32 s86, v63, 30
-; GFX9-NEXT:    v_readlane_b32 s85, v63, 29
-; GFX9-NEXT:    v_readlane_b32 s84, v63, 28
-; GFX9-NEXT:    v_readlane_b32 s83, v63, 27
-; GFX9-NEXT:    v_readlane_b32 s82, v63, 26
-; GFX9-NEXT:    v_readlane_b32 s81, v63, 25
-; GFX9-NEXT:    v_readlane_b32 s80, v63, 24
-; GFX9-NEXT:    v_readlane_b32 s71, v63, 23
-; GFX9-NEXT:    v_readlane_b32 s70, v63, 22
-; GFX9-NEXT:    v_readlane_b32 s69, v63, 21
-; GFX9-NEXT:    v_readlane_b32 s68, v63, 20
-; GFX9-NEXT:    v_readlane_b32 s67, v63, 19
-; GFX9-NEXT:    v_readlane_b32 s66, v63, 18
-; GFX9-NEXT:    v_readlane_b32 s65, v63, 17
-; GFX9-NEXT:    v_readlane_b32 s64, v63, 16
-; GFX9-NEXT:    v_readlane_b32 s55, v63, 15
-; GFX9-NEXT:    v_readlane_b32 s54, v63, 14
-; GFX9-NEXT:    v_readlane_b32 s53, v63, 13
-; GFX9-NEXT:    v_readlane_b32 s52, v63, 12
-; GFX9-NEXT:    v_readlane_b32 s51, v63, 11
-; GFX9-NEXT:    v_readlane_b32 s50, v63, 10
-; GFX9-NEXT:    v_readlane_b32 s49, v63, 9
-; GFX9-NEXT:    v_readlane_b32 s48, v63, 8
-; GFX9-NEXT:    v_readlane_b32 s39, v63, 7
-; GFX9-NEXT:    v_readlane_b32 s38, v63, 6
-; GFX9-NEXT:    v_readlane_b32 s37, v63, 5
-; GFX9-NEXT:    v_readlane_b32 s36, v63, 4
-; GFX9-NEXT:    v_readlane_b32 s35, v63, 3
-; GFX9-NEXT:    v_readlane_b32 s34, v63, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v63, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v63, 0
+; GFX9-NEXT:    v_readlane_b32 s30, v63, 34
+; GFX9-NEXT:    v_readlane_b32 s31, v63, 35
+; GFX9-NEXT:    v_readlane_b32 s99, v63, 33
+; GFX9-NEXT:    v_readlane_b32 s98, v63, 32
+; GFX9-NEXT:    v_readlane_b32 s97, v63, 31
+; GFX9-NEXT:    v_readlane_b32 s96, v63, 30
+; GFX9-NEXT:    v_readlane_b32 s87, v63, 29
+; GFX9-NEXT:    v_readlane_b32 s86, v63, 28
+; GFX9-NEXT:    v_readlane_b32 s85, v63, 27
+; GFX9-NEXT:    v_readlane_b32 s84, v63, 26
+; GFX9-NEXT:    v_readlane_b32 s83, v63, 25
+; GFX9-NEXT:    v_readlane_b32 s82, v63, 24
+; GFX9-NEXT:    v_readlane_b32 s81, v63, 23
+; GFX9-NEXT:    v_readlane_b32 s80, v63, 22
+; GFX9-NEXT:    v_readlane_b32 s71, v63, 21
+; GFX9-NEXT:    v_readlane_b32 s70, v63, 20
+; GFX9-NEXT:    v_readlane_b32 s69, v63, 19
+; GFX9-NEXT:    v_readlane_b32 s68, v63, 18
+; GFX9-NEXT:    v_readlane_b32 s67, v63, 17
+; GFX9-NEXT:    v_readlane_b32 s66, v63, 16
+; GFX9-NEXT:    v_readlane_b32 s65, v63, 15
+; GFX9-NEXT:    v_readlane_b32 s64, v63, 14
+; GFX9-NEXT:    v_readlane_b32 s55, v63, 13
+; GFX9-NEXT:    v_readlane_b32 s54, v63, 12
+; GFX9-NEXT:    v_readlane_b32 s53, v63, 11
+; GFX9-NEXT:    v_readlane_b32 s52, v63, 10
+; GFX9-NEXT:    v_readlane_b32 s51, v63, 9
+; GFX9-NEXT:    v_readlane_b32 s50, v63, 8
+; GFX9-NEXT:    v_readlane_b32 s49, v63, 7
+; GFX9-NEXT:    v_readlane_b32 s48, v63, 6
+; GFX9-NEXT:    v_readlane_b32 s39, v63, 5
+; GFX9-NEXT:    v_readlane_b32 s38, v63, 4
+; GFX9-NEXT:    v_readlane_b32 s37, v63, 3
+; GFX9-NEXT:    v_readlane_b32 s36, v63, 2
+; GFX9-NEXT:    v_readlane_b32 s35, v63, 1
+; GFX9-NEXT:    v_readlane_b32 s34, v63, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NEXT:    v_perm_b32 v36, v58, v52, s4
 ; GFX9-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
@@ -204939,33 +204939,33 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; GFX11-NEXT:    scratch_store_b32 off, v76, s32 offset:80
 ; GFX11-NEXT:    scratch_store_b32 off, v77, s32 offset:84
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s4
-; GFX11-NEXT:    v_writelane_b32 v74, s30, 0
-; GFX11-NEXT:    v_writelane_b32 v75, s96, 0
+; GFX11-NEXT:    v_writelane_b32 v74, s34, 0
+; GFX11-NEXT:    v_writelane_b32 v75, s98, 0
 ; GFX11-NEXT:    v_readfirstlane_b32 s42, v15
 ; GFX11-NEXT:    v_readfirstlane_b32 s5, v14
 ; GFX11-NEXT:    v_readfirstlane_b32 s4, v13
-; GFX11-NEXT:    v_writelane_b32 v74, s31, 1
-; GFX11-NEXT:    v_writelane_b32 v75, s97, 1
+; GFX11-NEXT:    v_writelane_b32 v74, s35, 1
+; GFX11-NEXT:    v_writelane_b32 v75, s99, 1
 ; GFX11-NEXT:    v_readfirstlane_b32 s7, v12
 ; GFX11-NEXT:    v_readfirstlane_b32 s6, v11
 ; GFX11-NEXT:    v_readfirstlane_b32 s9, v10
-; GFX11-NEXT:    v_writelane_b32 v74, s34, 2
-; GFX11-NEXT:    v_writelane_b32 v75, s98, 2
+; GFX11-NEXT:    v_writelane_b32 v74, s36, 2
+; GFX11-NEXT:    v_writelane_b32 v75, s100, 2
 ; GFX11-NEXT:    v_readfirstlane_b32 s8, v9
 ; GFX11-NEXT:    v_readfirstlane_b32 s11, v8
 ; GFX11-NEXT:    v_readfirstlane_b32 s10, v7
-; GFX11-NEXT:    v_writelane_b32 v74, s35, 3
-; GFX11-NEXT:    v_writelane_b32 v75, s99, 3
+; GFX11-NEXT:    v_writelane_b32 v74, s37, 3
+; GFX11-NEXT:    v_writelane_b32 v75, s101, 3
 ; GFX11-NEXT:    v_readfirstlane_b32 s13, v6
 ; GFX11-NEXT:    v_readfirstlane_b32 s12, v5
 ; GFX11-NEXT:    v_readfirstlane_b32 s15, v4
-; GFX11-NEXT:    v_writelane_b32 v74, s36, 4
-; GFX11-NEXT:    v_writelane_b32 v75, s100, 4
+; GFX11-NEXT:    v_writelane_b32 v74, s38, 4
+; GFX11-NEXT:    v_writelane_b32 v75, s102, 4
 ; GFX11-NEXT:    v_readfirstlane_b32 s14, v3
 ; GFX11-NEXT:    v_readfirstlane_b32 s41, v2
 ; GFX11-NEXT:    v_readfirstlane_b32 s40, v1
-; GFX11-NEXT:    v_writelane_b32 v74, s37, 5
-; GFX11-NEXT:    v_writelane_b32 v75, s101, 5
+; GFX11-NEXT:    v_writelane_b32 v74, s39, 5
+; GFX11-NEXT:    v_writelane_b32 v75, s103, 5
 ; GFX11-NEXT:    s_cmp_lg_u32 s42, 0
 ; GFX11-NEXT:    s_mov_b32 vcc_lo, 0
 ; GFX11-NEXT:    s_clause 0x11 ; 72-byte Folded Spill
@@ -204987,37 +204987,37 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; GFX11-NEXT:    scratch_store_b32 off, v63, s32 offset:8
 ; GFX11-NEXT:    scratch_store_b32 off, v72, s32 offset:4
 ; GFX11-NEXT:    scratch_store_b32 off, v73, s32
-; GFX11-NEXT:    v_writelane_b32 v74, s38, 6
-; GFX11-NEXT:    v_writelane_b32 v75, s102, 6
+; GFX11-NEXT:    v_writelane_b32 v74, s48, 6
+; GFX11-NEXT:    v_writelane_b32 v75, s104, 6
 ; GFX11-NEXT:    ; implicit-def: $vgpr76 : SGPR spill to VGPR lane
 ; GFX11-NEXT:    ; implicit-def: $vgpr77 : SGPR spill to VGPR lane
-; GFX11-NEXT:    v_writelane_b32 v74, s39, 7
-; GFX11-NEXT:    v_writelane_b32 v75, s103, 7
-; GFX11-NEXT:    v_writelane_b32 v74, s48, 8
-; GFX11-NEXT:    v_writelane_b32 v75, s104, 8
-; GFX11-NEXT:    v_writelane_b32 v74, s49, 9
-; GFX11-NEXT:    v_writelane_b32 v74, s50, 10
-; GFX11-NEXT:    v_writelane_b32 v74, s51, 11
-; GFX11-NEXT:    v_writelane_b32 v74, s52, 12
-; GFX11-NEXT:    v_writelane_b32 v74, s53, 13
-; GFX11-NEXT:    v_writelane_b32 v74, s54, 14
-; GFX11-NEXT:    v_writelane_b32 v74, s55, 15
-; GFX11-NEXT:    v_writelane_b32 v74, s64, 16
-; GFX11-NEXT:    v_writelane_b32 v74, s65, 17
-; GFX11-NEXT:    v_writelane_b32 v74, s66, 18
-; GFX11-NEXT:    v_writelane_b32 v74, s67, 19
-; GFX11-NEXT:    v_writelane_b32 v74, s68, 20
-; GFX11-NEXT:    v_writelane_b32 v74, s69, 21
-; GFX11-NEXT:    v_writelane_b32 v74, s70, 22
-; GFX11-NEXT:    v_writelane_b32 v74, s71, 23
-; GFX11-NEXT:    v_writelane_b32 v74, s80, 24
-; GFX11-NEXT:    v_writelane_b32 v74, s81, 25
-; GFX11-NEXT:    v_writelane_b32 v74, s82, 26
-; GFX11-NEXT:    v_writelane_b32 v74, s83, 27
-; GFX11-NEXT:    v_writelane_b32 v74, s84, 28
-; GFX11-NEXT:    v_writelane_b32 v74, s85, 29
-; GFX11-NEXT:    v_writelane_b32 v74, s86, 30
-; GFX11-NEXT:    v_writelane_b32 v74, s87, 31
+; GFX11-NEXT:    v_writelane_b32 v74, s49, 7
+; GFX11-NEXT:    v_writelane_b32 v75, s30, 7
+; GFX11-NEXT:    v_writelane_b32 v74, s50, 8
+; GFX11-NEXT:    v_writelane_b32 v75, s31, 8
+; GFX11-NEXT:    v_writelane_b32 v74, s51, 9
+; GFX11-NEXT:    v_writelane_b32 v74, s52, 10
+; GFX11-NEXT:    v_writelane_b32 v74, s53, 11
+; GFX11-NEXT:    v_writelane_b32 v74, s54, 12
+; GFX11-NEXT:    v_writelane_b32 v74, s55, 13
+; GFX11-NEXT:    v_writelane_b32 v74, s64, 14
+; GFX11-NEXT:    v_writelane_b32 v74, s65, 15
+; GFX11-NEXT:    v_writelane_b32 v74, s66, 16
+; GFX11-NEXT:    v_writelane_b32 v74, s67, 17
+; GFX11-NEXT:    v_writelane_b32 v74, s68, 18
+; GFX11-NEXT:    v_writelane_b32 v74, s69, 19
+; GFX11-NEXT:    v_writelane_b32 v74, s70, 20
+; GFX11-NEXT:    v_writelane_b32 v74, s71, 21
+; GFX11-NEXT:    v_writelane_b32 v74, s80, 22
+; GFX11-NEXT:    v_writelane_b32 v74, s81, 23
+; GFX11-NEXT:    v_writelane_b32 v74, s82, 24
+; GFX11-NEXT:    v_writelane_b32 v74, s83, 25
+; GFX11-NEXT:    v_writelane_b32 v74, s84, 26
+; GFX11-NEXT:    v_writelane_b32 v74, s85, 27
+; GFX11-NEXT:    v_writelane_b32 v74, s86, 28
+; GFX11-NEXT:    v_writelane_b32 v74, s87, 29
+; GFX11-NEXT:    v_writelane_b32 v74, s96, 30
+; GFX11-NEXT:    v_writelane_b32 v74, s97, 31
 ; GFX11-NEXT:    s_cbranch_scc0 .LBB99_3
 ; GFX11-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX11-NEXT:    s_lshr_b32 s42, s27, 24
@@ -205740,47 +205740,47 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; GFX11-NEXT:    scratch_load_b32 v42, off, s32 offset:60
 ; GFX11-NEXT:    scratch_load_b32 v41, off, s32 offset:64
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s32 offset:68
-; GFX11-NEXT:    v_readlane_b32 s104, v75, 8
-; GFX11-NEXT:    v_readlane_b32 s103, v75, 7
-; GFX11-NEXT:    v_readlane_b32 s102, v75, 6
-; GFX11-NEXT:    v_readlane_b32 s101, v75, 5
-; GFX11-NEXT:    v_readlane_b32 s100, v75, 4
-; GFX11-NEXT:    v_readlane_b32 s99, v75, 3
-; GFX11-NEXT:    v_readlane_b32 s98, v75, 2
-; GFX11-NEXT:    v_readlane_b32 s97, v75, 1
-; GFX11-NEXT:    v_readlane_b32 s96, v75, 0
-; GFX11-NEXT:    v_readlane_b32 s87, v74, 31
-; GFX11-NEXT:    v_readlane_b32 s86, v74, 30
-; GFX11-NEXT:    v_readlane_b32 s85, v74, 29
-; GFX11-NEXT:    v_readlane_b32 s84, v74, 28
-; GFX11-NEXT:    v_readlane_b32 s83, v74, 27
-; GFX11-NEXT:    v_readlane_b32 s82, v74, 26
-; GFX11-NEXT:    v_readlane_b32 s81, v74, 25
-; GFX11-NEXT:    v_readlane_b32 s80, v74, 24
-; GFX11-NEXT:    v_readlane_b32 s71, v74, 23
-; GFX11-NEXT:    v_readlane_b32 s70, v74, 22
-; GFX11-NEXT:    v_readlane_b32 s69, v74, 21
-; GFX11-NEXT:    v_readlane_b32 s68, v74, 20
-; GFX11-NEXT:    v_readlane_b32 s67, v74, 19
-; GFX11-NEXT:    v_readlane_b32 s66, v74, 18
-; GFX11-NEXT:    v_readlane_b32 s65, v74, 17
-; GFX11-NEXT:    v_readlane_b32 s64, v74, 16
-; GFX11-NEXT:    v_readlane_b32 s55, v74, 15
-; GFX11-NEXT:    v_readlane_b32 s54, v74, 14
-; GFX11-NEXT:    v_readlane_b32 s53, v74, 13
-; GFX11-NEXT:    v_readlane_b32 s52, v74, 12
-; GFX11-NEXT:    v_readlane_b32 s51, v74, 11
-; GFX11-NEXT:    v_readlane_b32 s50, v74, 10
-; GFX11-NEXT:    v_readlane_b32 s49, v74, 9
-; GFX11-NEXT:    v_readlane_b32 s48, v74, 8
-; GFX11-NEXT:    v_readlane_b32 s39, v74, 7
-; GFX11-NEXT:    v_readlane_b32 s38, v74, 6
-; GFX11-NEXT:    v_readlane_b32 s37, v74, 5
-; GFX11-NEXT:    v_readlane_b32 s36, v74, 4
-; GFX11-NEXT:    v_readlane_b32 s35, v74, 3
-; GFX11-NEXT:    v_readlane_b32 s34, v74, 2
-; GFX11-NEXT:    v_readlane_b32 s31, v74, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v74, 0
+; GFX11-NEXT:    v_readlane_b32 s30, v75, 7
+; GFX11-NEXT:    v_readlane_b32 s31, v75, 8
+; GFX11-NEXT:    v_readlane_b32 s104, v75, 6
+; GFX11-NEXT:    v_readlane_b32 s103, v75, 5
+; GFX11-NEXT:    v_readlane_b32 s102, v75, 4
+; GFX11-NEXT:    v_readlane_b32 s101, v75, 3
+; GFX11-NEXT:    v_readlane_b32 s100, v75, 2
+; GFX11-NEXT:    v_readlane_b32 s99, v75, 1
+; GFX11-NEXT:    v_readlane_b32 s98, v75, 0
+; GFX11-NEXT:    v_readlane_b32 s97, v74, 31
+; GFX11-NEXT:    v_readlane_b32 s96, v74, 30
+; GFX11-NEXT:    v_readlane_b32 s87, v74, 29
+; GFX11-NEXT:    v_readlane_b32 s86, v74, 28
+; GFX11-NEXT:    v_readlane_b32 s85, v74, 27
+; GFX11-NEXT:    v_readlane_b32 s84, v74, 26
+; GFX11-NEXT:    v_readlane_b32 s83, v74, 25
+; GFX11-NEXT:    v_readlane_b32 s82, v74, 24
+; GFX11-NEXT:    v_readlane_b32 s81, v74, 23
+; GFX11-NEXT:    v_readlane_b32 s80, v74, 22
+; GFX11-NEXT:    v_readlane_b32 s71, v74, 21
+; GFX11-NEXT:    v_readlane_b32 s70, v74, 20
+; GFX11-NEXT:    v_readlane_b32 s69, v74, 19
+; GFX11-NEXT:    v_readlane_b32 s68, v74, 18
+; GFX11-NEXT:    v_readlane_b32 s67, v74, 17
+; GFX11-NEXT:    v_readlane_b32 s66, v74, 16
+; GFX11-NEXT:    v_readlane_b32 s65, v74, 15
+; GFX11-NEXT:    v_readlane_b32 s64, v74, 14
+; GFX11-NEXT:    v_readlane_b32 s55, v74, 13
+; GFX11-NEXT:    v_readlane_b32 s54, v74, 12
+; GFX11-NEXT:    v_readlane_b32 s53, v74, 11
+; GFX11-NEXT:    v_readlane_b32 s52, v74, 10
+; GFX11-NEXT:    v_readlane_b32 s51, v74, 9
+; GFX11-NEXT:    v_readlane_b32 s50, v74, 8
+; GFX11-NEXT:    v_readlane_b32 s49, v74, 7
+; GFX11-NEXT:    v_readlane_b32 s48, v74, 6
+; GFX11-NEXT:    v_readlane_b32 s39, v74, 5
+; GFX11-NEXT:    v_readlane_b32 s38, v74, 4
+; GFX11-NEXT:    v_readlane_b32 s37, v74, 3
+; GFX11-NEXT:    v_readlane_b32 s36, v74, 2
+; GFX11-NEXT:    v_readlane_b32 s35, v74, 1
+; GFX11-NEXT:    v_readlane_b32 s34, v74, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX11-NEXT:    s_clause 0x3 ; 16-byte Folded Reload
 ; GFX11-NEXT:    scratch_load_b32 v74, off, s32 offset:72
@@ -209277,16 +209277,18 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
 ; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v63, s30, 0
-; SI-NEXT:    v_writelane_b32 v63, s31, 1
+; SI-NEXT:    v_writelane_b32 v63, s34, 0
+; SI-NEXT:    v_writelane_b32 v63, s35, 1
+; SI-NEXT:    v_writelane_b32 v63, s36, 2
+; SI-NEXT:    v_writelane_b32 v63, s37, 3
+; SI-NEXT:    v_writelane_b32 v63, s30, 4
 ; SI-NEXT:    v_readfirstlane_b32 vcc_lo, v2
-; SI-NEXT:    v_writelane_b32 v63, s34, 2
+; SI-NEXT:    v_writelane_b32 v63, s31, 5
 ; SI-NEXT:    s_and_b32 s6, s28, 0xffff0000
 ; SI-NEXT:    v_readfirstlane_b32 s92, v4
 ; SI-NEXT:    s_and_b32 s30, vcc_lo, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s31, vcc_lo, 16
 ; SI-NEXT:    v_readfirstlane_b32 vcc_lo, v1
-; SI-NEXT:    v_writelane_b32 v63, s35, 3
 ; SI-NEXT:    v_readfirstlane_b32 s78, v7
 ; SI-NEXT:    s_and_b32 s93, s92, 0xffff0000
 ; SI-NEXT:    s_and_b32 s34, vcc_lo, 0xffff0000
@@ -209345,14 +209347,12 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
 ; SI-NEXT:    v_readfirstlane_b32 s56, v14
 ; SI-NEXT:    s_and_b32 s57, s56, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s56, s56, 16
-; SI-NEXT:    v_writelane_b32 v63, s36, 4
 ; SI-NEXT:    v_readfirstlane_b32 s58, v13
 ; SI-NEXT:    v_readfirstlane_b32 s62, v11
 ; SI-NEXT:    v_readfirstlane_b32 s74, v9
 ; SI-NEXT:    v_readfirstlane_b32 s88, v6
 ; SI-NEXT:    v_readfirstlane_b32 s90, v5
 ; SI-NEXT:    v_readfirstlane_b32 s94, v3
-; SI-NEXT:    v_writelane_b32 v63, s37, 5
 ; SI-NEXT:    s_and_b32 s4, s29, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s5, s29, 16
 ; SI-NEXT:    s_lshl_b32 s7, s28, 16
@@ -209376,13 +209376,13 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
 ; SI-NEXT:    s_lshl_b32 s42, s42, 16
 ; SI-NEXT:    s_lshl_b32 s44, s44, 16
 ; SI-NEXT:    s_lshl_b32 s46, s46, 16
+; SI-NEXT:    s_and_b32 s59, s58, 0xffff0000
+; SI-NEXT:    s_lshl_b32 s58, s58, 16
+; SI-NEXT:    s_and_b32 s63, s62, 0xffff0000
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(1)
 ; SI-NEXT:    v_mul_f32_e64 v0, 1.0, s11
-; SI-NEXT:    s_and_b32 s59, s58, 0xffff0000
-; SI-NEXT:    s_lshl_b32 s58, s58, 16
-; SI-NEXT:    s_and_b32 s63, s62, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s62, s62, 16
 ; SI-NEXT:    s_and_b32 s75, s74, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s74, s74, 16
@@ -209437,14 +209437,14 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
 ; SI-NEXT:    v_mul_f32_e64 v55, 1.0, s73
 ; SI-NEXT:    v_mul_f32_e64 v56, 1.0, s63
 ; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s61
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_mul_f32_e64 v0, 1.0, s31
 ; SI-NEXT:    v_mul_f32_e64 v37, 1.0, s59
 ; SI-NEXT:    v_mul_f32_e64 v7, 1.0, s19
 ; SI-NEXT:    v_mul_f32_e64 v4, 1.0, s21
 ; SI-NEXT:    v_mul_f32_e64 v13, 1.0, s13
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mul_f32_e64 v0, 1.0, s31
 ; SI-NEXT:    v_mul_f32_e64 v16, 1.0, s9
 ; SI-NEXT:    v_mul_f32_e64 v19, 1.0, s5
 ; SI-NEXT:    v_mul_f32_e64 v22, 1.0, s35
@@ -210452,12 +210452,12 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
 ; SI-NEXT:    v_lshlrev_b32_e32 v22, 16, v58
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; SI-NEXT:    v_readlane_b32 s37, v63, 5
-; SI-NEXT:    v_readlane_b32 s36, v63, 4
-; SI-NEXT:    v_readlane_b32 s35, v63, 3
-; SI-NEXT:    v_readlane_b32 s34, v63, 2
-; SI-NEXT:    v_readlane_b32 s31, v63, 1
-; SI-NEXT:    v_readlane_b32 s30, v63, 0
+; SI-NEXT:    v_readlane_b32 s30, v63, 4
+; SI-NEXT:    v_readlane_b32 s31, v63, 5
+; SI-NEXT:    v_readlane_b32 s37, v63, 3
+; SI-NEXT:    v_readlane_b32 s36, v63, 2
+; SI-NEXT:    v_readlane_b32 s35, v63, 1
+; SI-NEXT:    v_readlane_b32 s34, v63, 0
 ; SI-NEXT:    v_and_b32_e32 v9, 0xffff, v12
 ; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
@@ -210858,17 +210858,17 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v35, s30, 0
-; VI-NEXT:    v_writelane_b32 v35, s31, 1
-; VI-NEXT:    v_writelane_b32 v35, s36, 2
-; VI-NEXT:    v_writelane_b32 v35, s37, 3
-; VI-NEXT:    v_writelane_b32 v35, s38, 4
-; VI-NEXT:    v_writelane_b32 v35, s39, 5
-; VI-NEXT:    v_writelane_b32 v35, s48, 6
-; VI-NEXT:    v_writelane_b32 v35, s49, 7
-; VI-NEXT:    v_writelane_b32 v35, s50, 8
+; VI-NEXT:    v_writelane_b32 v35, s36, 0
+; VI-NEXT:    v_writelane_b32 v35, s37, 1
+; VI-NEXT:    v_writelane_b32 v35, s38, 2
+; VI-NEXT:    v_writelane_b32 v35, s39, 3
+; VI-NEXT:    v_writelane_b32 v35, s48, 4
+; VI-NEXT:    v_writelane_b32 v35, s49, 5
+; VI-NEXT:    v_writelane_b32 v35, s50, 6
+; VI-NEXT:    v_writelane_b32 v35, s51, 7
+; VI-NEXT:    v_writelane_b32 v35, s30, 8
 ; VI-NEXT:    v_readfirstlane_b32 s4, v18
-; VI-NEXT:    v_writelane_b32 v35, s51, 9
+; VI-NEXT:    v_writelane_b32 v35, s31, 9
 ; VI-NEXT:    v_readfirstlane_b32 s51, v17
 ; VI-NEXT:    v_readfirstlane_b32 s50, v16
 ; VI-NEXT:    v_readfirstlane_b32 s49, v15
@@ -211521,16 +211521,16 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
 ; VI-NEXT:    v_mov_b32_e32 v30, s50
 ; VI-NEXT:    v_mov_b32_e32 v31, s51
 ; VI-NEXT:  .LBB101_5: ; %end
-; VI-NEXT:    v_readlane_b32 s51, v35, 9
-; VI-NEXT:    v_readlane_b32 s50, v35, 8
-; VI-NEXT:    v_readlane_b32 s49, v35, 7
-; VI-NEXT:    v_readlane_b32 s48, v35, 6
-; VI-NEXT:    v_readlane_b32 s39, v35, 5
-; VI-NEXT:    v_readlane_b32 s38, v35, 4
-; VI-NEXT:    v_readlane_b32 s37, v35, 3
-; VI-NEXT:    v_readlane_b32 s36, v35, 2
-; VI-NEXT:    v_readlane_b32 s31, v35, 1
-; VI-NEXT:    v_readlane_b32 s30, v35, 0
+; VI-NEXT:    v_readlane_b32 s30, v35, 8
+; VI-NEXT:    v_readlane_b32 s31, v35, 9
+; VI-NEXT:    v_readlane_b32 s51, v35, 7
+; VI-NEXT:    v_readlane_b32 s50, v35, 6
+; VI-NEXT:    v_readlane_b32 s49, v35, 5
+; VI-NEXT:    v_readlane_b32 s48, v35, 4
+; VI-NEXT:    v_readlane_b32 s39, v35, 3
+; VI-NEXT:    v_readlane_b32 s38, v35, 2
+; VI-NEXT:    v_readlane_b32 s37, v35, 1
+; VI-NEXT:    v_readlane_b32 s36, v35, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -211543,17 +211543,17 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v36, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v36, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v36, s31, 1
-; GFX9-NEXT:    v_writelane_b32 v36, s36, 2
-; GFX9-NEXT:    v_writelane_b32 v36, s37, 3
-; GFX9-NEXT:    v_writelane_b32 v36, s38, 4
-; GFX9-NEXT:    v_writelane_b32 v36, s39, 5
-; GFX9-NEXT:    v_writelane_b32 v36, s48, 6
-; GFX9-NEXT:    v_writelane_b32 v36, s49, 7
-; GFX9-NEXT:    v_writelane_b32 v36, s50, 8
+; GFX9-NEXT:    v_writelane_b32 v36, s36, 0
+; GFX9-NEXT:    v_writelane_b32 v36, s37, 1
+; GFX9-NEXT:    v_writelane_b32 v36, s38, 2
+; GFX9-NEXT:    v_writelane_b32 v36, s39, 3
+; GFX9-NEXT:    v_writelane_b32 v36, s48, 4
+; GFX9-NEXT:    v_writelane_b32 v36, s49, 5
+; GFX9-NEXT:    v_writelane_b32 v36, s50, 6
+; GFX9-NEXT:    v_writelane_b32 v36, s51, 7
+; GFX9-NEXT:    v_writelane_b32 v36, s30, 8
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v18
-; GFX9-NEXT:    v_writelane_b32 v36, s51, 9
+; GFX9-NEXT:    v_writelane_b32 v36, s31, 9
 ; GFX9-NEXT:    v_readfirstlane_b32 s51, v17
 ; GFX9-NEXT:    v_readfirstlane_b32 s50, v16
 ; GFX9-NEXT:    v_readfirstlane_b32 s49, v15
@@ -212224,16 +212224,16 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
 ; GFX9-NEXT:    v_mov_b32_e32 v30, s50
 ; GFX9-NEXT:    v_mov_b32_e32 v31, s51
 ; GFX9-NEXT:  .LBB101_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s51, v36, 9
-; GFX9-NEXT:    v_readlane_b32 s50, v36, 8
-; GFX9-NEXT:    v_readlane_b32 s49, v36, 7
-; GFX9-NEXT:    v_readlane_b32 s48, v36, 6
-; GFX9-NEXT:    v_readlane_b32 s39, v36, 5
-; GFX9-NEXT:    v_readlane_b32 s38, v36, 4
-; GFX9-NEXT:    v_readlane_b32 s37, v36, 3
-; GFX9-NEXT:    v_readlane_b32 s36, v36, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v36, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v36, 0
+; GFX9-NEXT:    v_readlane_b32 s30, v36, 8
+; GFX9-NEXT:    v_readlane_b32 s31, v36, 9
+; GFX9-NEXT:    v_readlane_b32 s51, v36, 7
+; GFX9-NEXT:    v_readlane_b32 s50, v36, 6
+; GFX9-NEXT:    v_readlane_b32 s49, v36, 5
+; GFX9-NEXT:    v_readlane_b32 s48, v36, 4
+; GFX9-NEXT:    v_readlane_b32 s39, v36, 3
+; GFX9-NEXT:    v_readlane_b32 s38, v36, 2
+; GFX9-NEXT:    v_readlane_b32 s37, v36, 1
+; GFX9-NEXT:    v_readlane_b32 s36, v36, 0
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v36, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -215016,55 +215016,57 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg %
 ; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_writelane_b32 v63, s30, 0
-; SI-NEXT:    v_writelane_b32 v63, s31, 1
-; SI-NEXT:    v_writelane_b32 v63, s34, 2
-; SI-NEXT:    v_writelane_b32 v63, s35, 3
-; SI-NEXT:    v_writelane_b32 v63, s36, 4
-; SI-NEXT:    v_writelane_b32 v63, s37, 5
-; SI-NEXT:    v_writelane_b32 v63, s38, 6
-; SI-NEXT:    v_writelane_b32 v63, s39, 7
-; SI-NEXT:    v_writelane_b32 v63, s48, 8
-; SI-NEXT:    v_writelane_b32 v63, s49, 9
-; SI-NEXT:    v_writelane_b32 v63, s50, 10
-; SI-NEXT:    v_writelane_b32 v63, s51, 11
-; SI-NEXT:    v_writelane_b32 v63, s52, 12
-; SI-NEXT:    v_writelane_b32 v63, s53, 13
-; SI-NEXT:    v_writelane_b32 v63, s54, 14
-; SI-NEXT:    v_writelane_b32 v63, s55, 15
-; SI-NEXT:    v_writelane_b32 v63, s64, 16
-; SI-NEXT:    v_writelane_b32 v63, s65, 17
-; SI-NEXT:    v_writelane_b32 v63, s66, 18
-; SI-NEXT:    v_writelane_b32 v63, s67, 19
-; SI-NEXT:    v_writelane_b32 v63, s68, 20
-; SI-NEXT:    v_writelane_b32 v63, s69, 21
-; SI-NEXT:    v_writelane_b32 v63, s70, 22
-; SI-NEXT:    v_writelane_b32 v63, s71, 23
-; SI-NEXT:    v_writelane_b32 v63, s80, 24
-; SI-NEXT:    v_writelane_b32 v63, s81, 25
-; SI-NEXT:    v_writelane_b32 v63, s82, 26
+; SI-NEXT:    v_writelane_b32 v63, s34, 0
+; SI-NEXT:    v_writelane_b32 v63, s35, 1
+; SI-NEXT:    v_writelane_b32 v63, s36, 2
+; SI-NEXT:    v_writelane_b32 v63, s37, 3
+; SI-NEXT:    v_writelane_b32 v63, s38, 4
+; SI-NEXT:    v_writelane_b32 v63, s39, 5
+; SI-NEXT:    v_writelane_b32 v63, s48, 6
+; SI-NEXT:    v_writelane_b32 v63, s49, 7
+; SI-NEXT:    v_writelane_b32 v63, s50, 8
+; SI-NEXT:    v_writelane_b32 v63, s51, 9
+; SI-NEXT:    v_writelane_b32 v63, s52, 10
+; SI-NEXT:    v_writelane_b32 v63, s53, 11
+; SI-NEXT:    v_writelane_b32 v63, s54, 12
+; SI-NEXT:    v_writelane_b32 v63, s55, 13
+; SI-NEXT:    v_writelane_b32 v63, s64, 14
+; SI-NEXT:    v_writelane_b32 v63, s65, 15
+; SI-NEXT:    v_writelane_b32 v63, s66, 16
+; SI-NEXT:    v_writelane_b32 v63, s67, 17
+; SI-NEXT:    v_writelane_b32 v63, s68, 18
+; SI-NEXT:    v_writelane_b32 v63, s69, 19
+; SI-NEXT:    v_writelane_b32 v63, s70, 20
+; SI-NEXT:    v_writelane_b32 v63, s71, 21
+; SI-NEXT:    v_writelane_b32 v63, s80, 22
+; SI-NEXT:    v_writelane_b32 v63, s81, 23
+; SI-NEXT:    v_writelane_b32 v63, s82, 24
+; SI-NEXT:    v_writelane_b32 v63, s83, 25
+; SI-NEXT:    v_writelane_b32 v63, s84, 26
+; SI-NEXT:    v_writelane_b32 v63, s85, 27
+; SI-NEXT:    v_writelane_b32 v63, s86, 28
 ; SI-NEXT:    s_lshr_b32 s5, s16, 16
 ; SI-NEXT:    ; implicit-def: $vgpr62 : SGPR spill to VGPR lane
-; SI-NEXT:    v_writelane_b32 v63, s83, 27
+; SI-NEXT:    v_writelane_b32 v63, s87, 29
 ; SI-NEXT:    s_lshr_b32 s6, s17, 16
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_writelane_b32 v62, s5, 0
-; SI-NEXT:    v_writelane_b32 v63, s84, 28
+; SI-NEXT:    v_writelane_b32 v63, s96, 30
 ; SI-NEXT:    s_lshr_b32 s7, s18, 16
 ; SI-NEXT:    v_writelane_b32 v62, s6, 1
-; SI-NEXT:    v_writelane_b32 v63, s85, 29
+; SI-NEXT:    v_writelane_b32 v63, s97, 31
 ; SI-NEXT:    s_lshr_b32 s8, s19, 16
 ; SI-NEXT:    v_writelane_b32 v62, s7, 2
-; SI-NEXT:    v_writelane_b32 v63, s86, 30
+; SI-NEXT:    v_writelane_b32 v63, s98, 32
 ; SI-NEXT:    s_lshr_b32 s9, s20, 16
 ; SI-NEXT:    v_writelane_b32 v62, s8, 3
-; SI-NEXT:    v_writelane_b32 v63, s87, 31
+; SI-NEXT:    v_writelane_b32 v63, s99, 33
 ; SI-NEXT:    s_lshr_b32 s10, s21, 16
 ; SI-NEXT:    v_writelane_b32 v62, s9, 4
-; SI-NEXT:    v_writelane_b32 v63, s96, 32
+; SI-NEXT:    v_writelane_b32 v63, s30, 34
 ; SI-NEXT:    s_lshr_b32 s11, s22, 16
 ; SI-NEXT:    v_writelane_b32 v62, s10, 5
-; SI-NEXT:    v_writelane_b32 v63, s97, 33
+; SI-NEXT:    v_writelane_b32 v63, s31, 35
 ; SI-NEXT:    s_lshr_b32 s12, s23, 16
 ; SI-NEXT:    v_readfirstlane_b32 s52, v17
 ; SI-NEXT:    v_readfirstlane_b32 s48, v16
@@ -215085,7 +215087,6 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg %
 ; SI-NEXT:    v_readfirstlane_b32 s46, v1
 ; SI-NEXT:    v_readfirstlane_b32 s44, v0
 ; SI-NEXT:    v_writelane_b32 v62, s11, 6
-; SI-NEXT:    v_writelane_b32 v63, s98, 34
 ; SI-NEXT:    s_lshr_b32 s43, s29, 16
 ; SI-NEXT:    s_lshr_b32 s42, s28, 16
 ; SI-NEXT:    s_lshr_b32 s41, s27, 16
@@ -215112,7 +215113,6 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg %
 ; SI-NEXT:    s_lshr_b32 s45, s44, 16
 ; SI-NEXT:    v_readfirstlane_b32 s4, v18
 ; SI-NEXT:    v_writelane_b32 v62, s12, 7
-; SI-NEXT:    v_writelane_b32 v63, s99, 35
 ; SI-NEXT:    s_cmp_lg_u32 s4, 0
 ; SI-NEXT:    v_writelane_b32 v62, s13, 8
 ; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -215850,42 +215850,42 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg %
 ; SI-NEXT:    v_mul_f32_e32 v40, 1.0, v40
 ; SI-NEXT:    v_mul_f32_e32 v54, 1.0, v54
 ; SI-NEXT:    v_mul_f32_e32 v52, 1.0, v52
-; SI-NEXT:    v_readlane_b32 s99, v63, 35
-; SI-NEXT:    v_readlane_b32 s98, v63, 34
-; SI-NEXT:    v_readlane_b32 s97, v63, 33
-; SI-NEXT:    v_readlane_b32 s96, v63, 32
-; SI-NEXT:    v_readlane_b32 s87, v63, 31
-; SI-NEXT:    v_readlane_b32 s86, v63, 30
-; SI-NEXT:    v_readlane_b32 s85, v63, 29
-; SI-NEXT:    v_readlane_b32 s84, v63, 28
-; SI-NEXT:    v_readlane_b32 s83, v63, 27
-; SI-NEXT:    v_readlane_b32 s82, v63, 26
-; SI-NEXT:    v_readlane_b32 s81, v63, 25
-; SI-NEXT:    v_readlane_b32 s80, v63, 24
-; SI-NEXT:    v_readlane_b32 s71, v63, 23
-; SI-NEXT:    v_readlane_b32 s70, v63, 22
-; SI-NEXT:    v_readlane_b32 s69, v63, 21
-; SI-NEXT:    v_readlane_b32 s68, v63, 20
-; SI-NEXT:    v_readlane_b32 s67, v63, 19
-; SI-NEXT:    v_readlane_b32 s66, v63, 18
-; SI-NEXT:    v_readlane_b32 s65, v63, 17
-; SI-NEXT:    v_readlane_b32 s64, v63, 16
-; SI-NEXT:    v_readlane_b32 s55, v63, 15
-; SI-NEXT:    v_readlane_b32 s54, v63, 14
-; SI-NEXT:    v_readlane_b32 s53, v63, 13
-; SI-NEXT:    v_readlane_b32 s52, v63, 12
-; SI-NEXT:    v_readlane_b32 s51, v63, 11
-; SI-NEXT:    v_readlane_b32 s50, v63, 10
-; SI-NEXT:    v_readlane_b32 s49, v63, 9
-; SI-NEXT:    v_readlane_b32 s48, v63, 8
-; SI-NEXT:    v_readlane_b32 s39, v63, 7
-; SI-NEXT:    v_readlane_b32 s38, v63, 6
-; SI-NEXT:    v_readlane_b32 s37, v63, 5
-; SI-NEXT:    v_readlane_b32 s36, v63, 4
-; SI-NEXT:    v_readlane_b32 s35, v63, 3
-; SI-NEXT:    v_readlane_b32 s34, v63, 2
-; SI-NEXT:    v_readlane_b32 s31, v63, 1
-; SI-NEXT:    v_readlane_b32 s30, v63, 0
+; SI-NEXT:    v_readlane_b32 s30, v63, 34
+; SI-NEXT:    v_readlane_b32 s31, v63, 35
+; SI-NEXT:    v_readlane_b32 s99, v63, 33
+; SI-NEXT:    v_readlane_b32 s98, v63, 32
+; SI-NEXT:    v_readlane_b32 s97, v63, 31
+; SI-NEXT:    v_readlane_b32 s96, v63, 30
+; SI-NEXT:    v_readlane_b32 s87, v63, 29
+; SI-NEXT:    v_readlane_b32 s86, v63, 28
+; SI-NEXT:    v_readlane_b32 s85, v63, 27
+; SI-NEXT:    v_readlane_b32 s84, v63, 26
+; SI-NEXT:    v_readlane_b32 s83, v63, 25
+; SI-NEXT:    v_readlane_b32 s82, v63, 24
+; SI-NEXT:    v_readlane_b32 s81, v63, 23
+; SI-NEXT:    v_readlane_b32 s80, v63, 22
+; SI-NEXT:    v_readlane_b32 s71, v63, 21
+; SI-NEXT:    v_readlane_b32 s70, v63, 20
+; SI-NEXT:    v_readlane_b32 s69, v63, 19
+; SI-NEXT:    v_readlane_b32 s68, v63, 18
+; SI-NEXT:    v_readlane_b32 s67, v63, 17
+; SI-NEXT:    v_readlane_b32 s66, v63, 16
+; SI-NEXT:    v_readlane_b32 s65, v63, 15
+; SI-NEXT:    v_readlane_b32 s64, v63, 14
+; SI-NEXT:    v_readlane_b32 s55, v63, 13
+; SI-NEXT:    v_readlane_b32 s54, v63, 12
+; SI-NEXT:    v_readlane_b32 s53, v63, 11
+; SI-NEXT:    v_readlane_b32 s52, v63, 10
+; SI-NEXT:    v_readlane_b32 s51, v63, 9
+; SI-NEXT:    v_readlane_b32 s50, v63, 8
+; SI-NEXT:    v_readlane_b32 s49, v63, 7
+; SI-NEXT:    v_readlane_b32 s48, v63, 6
+; SI-NEXT:    v_readlane_b32 s39, v63, 5
+; SI-NEXT:    v_readlane_b32 s38, v63, 4
+; SI-NEXT:    v_readlane_b32 s37, v63, 3
+; SI-NEXT:    v_readlane_b32 s36, v63, 2
+; SI-NEXT:    v_readlane_b32 s35, v63, 1
+; SI-NEXT:    v_readlane_b32 s34, v63, 0
 ; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_mul_f32_e32 v48, 1.0, v0
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
@@ -215975,17 +215975,17 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg %
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v33, s30, 0
-; VI-NEXT:    v_writelane_b32 v33, s31, 1
-; VI-NEXT:    v_writelane_b32 v33, s36, 2
-; VI-NEXT:    v_writelane_b32 v33, s37, 3
-; VI-NEXT:    v_writelane_b32 v33, s38, 4
-; VI-NEXT:    v_writelane_b32 v33, s39, 5
-; VI-NEXT:    v_writelane_b32 v33, s48, 6
-; VI-NEXT:    v_writelane_b32 v33, s49, 7
-; VI-NEXT:    v_writelane_b32 v33, s50, 8
+; VI-NEXT:    v_writelane_b32 v33, s36, 0
+; VI-NEXT:    v_writelane_b32 v33, s37, 1
+; VI-NEXT:    v_writelane_b32 v33, s38, 2
+; VI-NEXT:    v_writelane_b32 v33, s39, 3
+; VI-NEXT:    v_writelane_b32 v33, s48, 4
+; VI-NEXT:    v_writelane_b32 v33, s49, 5
+; VI-NEXT:    v_writelane_b32 v33, s50, 6
+; VI-NEXT:    v_writelane_b32 v33, s51, 7
+; VI-NEXT:    v_writelane_b32 v33, s30, 8
 ; VI-NEXT:    v_readfirstlane_b32 s4, v18
-; VI-NEXT:    v_writelane_b32 v33, s51, 9
+; VI-NEXT:    v_writelane_b32 v33, s31, 9
 ; VI-NEXT:    v_readfirstlane_b32 s51, v17
 ; VI-NEXT:    v_readfirstlane_b32 s50, v16
 ; VI-NEXT:    v_readfirstlane_b32 s49, v15
@@ -216207,16 +216207,16 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg %
 ; VI-NEXT:    v_mov_b32_e32 v14, s30
 ; VI-NEXT:    v_mov_b32_e32 v15, s31
 ; VI-NEXT:  .LBB103_5: ; %end
-; VI-NEXT:    v_readlane_b32 s51, v33, 9
-; VI-NEXT:    v_readlane_b32 s50, v33, 8
-; VI-NEXT:    v_readlane_b32 s49, v33, 7
-; VI-NEXT:    v_readlane_b32 s48, v33, 6
-; VI-NEXT:    v_readlane_b32 s39, v33, 5
-; VI-NEXT:    v_readlane_b32 s38, v33, 4
-; VI-NEXT:    v_readlane_b32 s37, v33, 3
-; VI-NEXT:    v_readlane_b32 s36, v33, 2
-; VI-NEXT:    v_readlane_b32 s31, v33, 1
-; VI-NEXT:    v_readlane_b32 s30, v33, 0
+; VI-NEXT:    v_readlane_b32 s30, v33, 8
+; VI-NEXT:    v_readlane_b32 s31, v33, 9
+; VI-NEXT:    v_readlane_b32 s51, v33, 7
+; VI-NEXT:    v_readlane_b32 s50, v33, 6
+; VI-NEXT:    v_readlane_b32 s49, v33, 5
+; VI-NEXT:    v_readlane_b32 s48, v33, 4
+; VI-NEXT:    v_readlane_b32 s39, v33, 3
+; VI-NEXT:    v_readlane_b32 s38, v33, 2
+; VI-NEXT:    v_readlane_b32 s37, v33, 1
+; VI-NEXT:    v_readlane_b32 s36, v33, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -216229,17 +216229,17 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg %
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v32, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v32, s31, 1
-; GFX9-NEXT:    v_writelane_b32 v32, s36, 2
-; GFX9-NEXT:    v_writelane_b32 v32, s37, 3
-; GFX9-NEXT:    v_writelane_b32 v32, s38, 4
-; GFX9-NEXT:    v_writelane_b32 v32, s39, 5
-; GFX9-NEXT:    v_writelane_b32 v32, s48, 6
-; GFX9-NEXT:    v_writelane_b32 v32, s49, 7
-; GFX9-NEXT:    v_writelane_b32 v32, s50, 8
+; GFX9-NEXT:    v_writelane_b32 v32, s36, 0
+; GFX9-NEXT:    v_writelane_b32 v32, s37, 1
+; GFX9-NEXT:    v_writelane_b32 v32, s38, 2
+; GFX9-NEXT:    v_writelane_b32 v32, s39, 3
+; GFX9-NEXT:    v_writelane_b32 v32, s48, 4
+; GFX9-NEXT:    v_writelane_b32 v32, s49, 5
+; GFX9-NEXT:    v_writelane_b32 v32, s50, 6
+; GFX9-NEXT:    v_writelane_b32 v32, s51, 7
+; GFX9-NEXT:    v_writelane_b32 v32, s30, 8
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v18
-; GFX9-NEXT:    v_writelane_b32 v32, s51, 9
+; GFX9-NEXT:    v_writelane_b32 v32, s31, 9
 ; GFX9-NEXT:    v_readfirstlane_b32 s51, v17
 ; GFX9-NEXT:    v_readfirstlane_b32 s50, v16
 ; GFX9-NEXT:    v_readfirstlane_b32 s49, v15
@@ -216333,16 +216333,16 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg %
 ; GFX9-NEXT:    v_mov_b32_e32 v14, s30
 ; GFX9-NEXT:    v_mov_b32_e32 v15, s31
 ; GFX9-NEXT:  .LBB103_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s51, v32, 9
-; GFX9-NEXT:    v_readlane_b32 s50, v32, 8
-; GFX9-NEXT:    v_readlane_b32 s49, v32, 7
-; GFX9-NEXT:    v_readlane_b32 s48, v32, 6
-; GFX9-NEXT:    v_readlane_b32 s39, v32, 5
-; GFX9-NEXT:    v_readlane_b32 s38, v32, 4
-; GFX9-NEXT:    v_readlane_b32 s37, v32, 3
-; GFX9-NEXT:    v_readlane_b32 s36, v32, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v32, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v32, 0
+; GFX9-NEXT:    v_readlane_b32 s30, v32, 8
+; GFX9-NEXT:    v_readlane_b32 s31, v32, 9
+; GFX9-NEXT:    v_readlane_b32 s51, v32, 7
+; GFX9-NEXT:    v_readlane_b32 s50, v32, 6
+; GFX9-NEXT:    v_readlane_b32 s49, v32, 5
+; GFX9-NEXT:    v_readlane_b32 s48, v32, 4
+; GFX9-NEXT:    v_readlane_b32 s39, v32, 3
+; GFX9-NEXT:    v_readlane_b32 s38, v32, 2
+; GFX9-NEXT:    v_readlane_b32 s37, v32, 1
+; GFX9-NEXT:    v_readlane_b32 s36, v32, 0
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -219831,11 +219831,13 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v63, s30, 0
-; SI-NEXT:    v_writelane_b32 v63, s31, 1
-; SI-NEXT:    v_writelane_b32 v63, s34, 2
+; SI-NEXT:    v_writelane_b32 v63, s34, 0
+; SI-NEXT:    v_writelane_b32 v63, s35, 1
+; SI-NEXT:    v_writelane_b32 v63, s36, 2
+; SI-NEXT:    v_writelane_b32 v63, s37, 3
+; SI-NEXT:    v_writelane_b32 v63, s30, 4
 ; SI-NEXT:    v_readfirstlane_b32 vcc_lo, v2
-; SI-NEXT:    v_writelane_b32 v63, s35, 3
+; SI-NEXT:    v_writelane_b32 v63, s31, 5
 ; SI-NEXT:    s_and_b32 s14, s24, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s15, s24, 16
 ; SI-NEXT:    s_and_b32 s24, s23, 0xffff0000
@@ -219843,7 +219845,6 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:    s_and_b32 s30, vcc_lo, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s31, vcc_lo, 16
 ; SI-NEXT:    v_readfirstlane_b32 vcc_lo, v1
-; SI-NEXT:    v_writelane_b32 v63, s36, 4
 ; SI-NEXT:    s_lshl_b32 s13, s25, 16
 ; SI-NEXT:    v_readfirstlane_b32 s42, v17
 ; SI-NEXT:    v_readfirstlane_b32 s44, v16
@@ -219864,7 +219865,6 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:    s_lshl_b32 s35, vcc_lo, 16
 ; SI-NEXT:    v_readfirstlane_b32 vcc_lo, v0
 ; SI-NEXT:    v_mul_f32_e64 v0, 1.0, s23
-; SI-NEXT:    v_writelane_b32 v63, s37, 5
 ; SI-NEXT:    s_and_b32 s4, s29, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s5, s29, 16
 ; SI-NEXT:    s_and_b32 s6, s28, 0xffff0000
@@ -220836,12 +220836,12 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:    v_and_b32_e32 v13, 0xffff, v11
 ; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v22
 ; SI-NEXT:    v_lshlrev_b32_e32 v22, 16, v49
-; SI-NEXT:    v_readlane_b32 s37, v63, 5
-; SI-NEXT:    v_readlane_b32 s36, v63, 4
-; SI-NEXT:    v_readlane_b32 s35, v63, 3
-; SI-NEXT:    v_readlane_b32 s34, v63, 2
-; SI-NEXT:    v_readlane_b32 s31, v63, 1
-; SI-NEXT:    v_readlane_b32 s30, v63, 0
+; SI-NEXT:    v_readlane_b32 s30, v63, 4
+; SI-NEXT:    v_readlane_b32 s31, v63, 5
+; SI-NEXT:    v_readlane_b32 s37, v63, 3
+; SI-NEXT:    v_readlane_b32 s36, v63, 2
+; SI-NEXT:    v_readlane_b32 s35, v63, 1
+; SI-NEXT:    v_readlane_b32 s34, v63, 0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v24
 ; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
@@ -221036,17 +221036,17 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v35, s30, 0
-; VI-NEXT:    v_writelane_b32 v35, s31, 1
-; VI-NEXT:    v_writelane_b32 v35, s36, 2
-; VI-NEXT:    v_writelane_b32 v35, s37, 3
-; VI-NEXT:    v_writelane_b32 v35, s38, 4
-; VI-NEXT:    v_writelane_b32 v35, s39, 5
-; VI-NEXT:    v_writelane_b32 v35, s48, 6
-; VI-NEXT:    v_writelane_b32 v35, s49, 7
-; VI-NEXT:    v_writelane_b32 v35, s50, 8
+; VI-NEXT:    v_writelane_b32 v35, s36, 0
+; VI-NEXT:    v_writelane_b32 v35, s37, 1
+; VI-NEXT:    v_writelane_b32 v35, s38, 2
+; VI-NEXT:    v_writelane_b32 v35, s39, 3
+; VI-NEXT:    v_writelane_b32 v35, s48, 4
+; VI-NEXT:    v_writelane_b32 v35, s49, 5
+; VI-NEXT:    v_writelane_b32 v35, s50, 6
+; VI-NEXT:    v_writelane_b32 v35, s51, 7
+; VI-NEXT:    v_writelane_b32 v35, s30, 8
 ; VI-NEXT:    v_readfirstlane_b32 s4, v18
-; VI-NEXT:    v_writelane_b32 v35, s51, 9
+; VI-NEXT:    v_writelane_b32 v35, s31, 9
 ; VI-NEXT:    v_readfirstlane_b32 s51, v17
 ; VI-NEXT:    v_readfirstlane_b32 s50, v16
 ; VI-NEXT:    v_readfirstlane_b32 s49, v15
@@ -221699,16 +221699,16 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
 ; VI-NEXT:    v_mov_b32_e32 v30, s50
 ; VI-NEXT:    v_mov_b32_e32 v31, s51
 ; VI-NEXT:  .LBB105_5: ; %end
-; VI-NEXT:    v_readlane_b32 s51, v35, 9
-; VI-NEXT:    v_readlane_b32 s50, v35, 8
-; VI-NEXT:    v_readlane_b32 s49, v35, 7
-; VI-NEXT:    v_readlane_b32 s48, v35, 6
-; VI-NEXT:    v_readlane_b32 s39, v35, 5
-; VI-NEXT:    v_readlane_b32 s38, v35, 4
-; VI-NEXT:    v_readlane_b32 s37, v35, 3
-; VI-NEXT:    v_readlane_b32 s36, v35, 2
-; VI-NEXT:    v_readlane_b32 s31, v35, 1
-; VI-NEXT:    v_readlane_b32 s30, v35, 0
+; VI-NEXT:    v_readlane_b32 s30, v35, 8
+; VI-NEXT:    v_readlane_b32 s31, v35, 9
+; VI-NEXT:    v_readlane_b32 s51, v35, 7
+; VI-NEXT:    v_readlane_b32 s50, v35, 6
+; VI-NEXT:    v_readlane_b32 s49, v35, 5
+; VI-NEXT:    v_readlane_b32 s48, v35, 4
+; VI-NEXT:    v_readlane_b32 s39, v35, 3
+; VI-NEXT:    v_readlane_b32 s38, v35, 2
+; VI-NEXT:    v_readlane_b32 s37, v35, 1
+; VI-NEXT:    v_readlane_b32 s36, v35, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -221721,17 +221721,17 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v36, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v36, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v36, s31, 1
-; GFX9-NEXT:    v_writelane_b32 v36, s36, 2
-; GFX9-NEXT:    v_writelane_b32 v36, s37, 3
-; GFX9-NEXT:    v_writelane_b32 v36, s38, 4
-; GFX9-NEXT:    v_writelane_b32 v36, s39, 5
-; GFX9-NEXT:    v_writelane_b32 v36, s48, 6
-; GFX9-NEXT:    v_writelane_b32 v36, s49, 7
-; GFX9-NEXT:    v_writelane_b32 v36, s50, 8
+; GFX9-NEXT:    v_writelane_b32 v36, s36, 0
+; GFX9-NEXT:    v_writelane_b32 v36, s37, 1
+; GFX9-NEXT:    v_writelane_b32 v36, s38, 2
+; GFX9-NEXT:    v_writelane_b32 v36, s39, 3
+; GFX9-NEXT:    v_writelane_b32 v36, s48, 4
+; GFX9-NEXT:    v_writelane_b32 v36, s49, 5
+; GFX9-NEXT:    v_writelane_b32 v36, s50, 6
+; GFX9-NEXT:    v_writelane_b32 v36, s51, 7
+; GFX9-NEXT:    v_writelane_b32 v36, s30, 8
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v18
-; GFX9-NEXT:    v_writelane_b32 v36, s51, 9
+; GFX9-NEXT:    v_writelane_b32 v36, s31, 9
 ; GFX9-NEXT:    v_readfirstlane_b32 s51, v17
 ; GFX9-NEXT:    v_readfirstlane_b32 s50, v16
 ; GFX9-NEXT:    v_readfirstlane_b32 s49, v15
@@ -222370,16 +222370,16 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
 ; GFX9-NEXT:    v_mov_b32_e32 v30, s50
 ; GFX9-NEXT:    v_mov_b32_e32 v31, s51
 ; GFX9-NEXT:  .LBB105_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s51, v36, 9
-; GFX9-NEXT:    v_readlane_b32 s50, v36, 8
-; GFX9-NEXT:    v_readlane_b32 s49, v36, 7
-; GFX9-NEXT:    v_readlane_b32 s48, v36, 6
-; GFX9-NEXT:    v_readlane_b32 s39, v36, 5
-; GFX9-NEXT:    v_readlane_b32 s38, v36, 4
-; GFX9-NEXT:    v_readlane_b32 s37, v36, 3
-; GFX9-NEXT:    v_readlane_b32 s36, v36, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v36, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v36, 0
+; GFX9-NEXT:    v_readlane_b32 s30, v36, 8
+; GFX9-NEXT:    v_readlane_b32 s31, v36, 9
+; GFX9-NEXT:    v_readlane_b32 s51, v36, 7
+; GFX9-NEXT:    v_readlane_b32 s50, v36, 6
+; GFX9-NEXT:    v_readlane_b32 s49, v36, 5
+; GFX9-NEXT:    v_readlane_b32 s48, v36, 4
+; GFX9-NEXT:    v_readlane_b32 s39, v36, 3
+; GFX9-NEXT:    v_readlane_b32 s38, v36, 2
+; GFX9-NEXT:    v_readlane_b32 s37, v36, 1
+; GFX9-NEXT:    v_readlane_b32 s36, v36, 0
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v36, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -224668,66 +224668,69 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a
 ; SI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_writelane_b32 v33, s30, 0
-; SI-NEXT:    v_writelane_b32 v33, s31, 1
-; SI-NEXT:    v_writelane_b32 v33, s34, 2
-; SI-NEXT:    v_writelane_b32 v33, s35, 3
-; SI-NEXT:    v_writelane_b32 v33, s36, 4
-; SI-NEXT:    v_writelane_b32 v33, s37, 5
-; SI-NEXT:    v_writelane_b32 v33, s38, 6
-; SI-NEXT:    v_writelane_b32 v33, s39, 7
-; SI-NEXT:    v_writelane_b32 v33, s48, 8
-; SI-NEXT:    v_writelane_b32 v33, s49, 9
-; SI-NEXT:    v_writelane_b32 v33, s50, 10
-; SI-NEXT:    v_writelane_b32 v33, s51, 11
-; SI-NEXT:    v_writelane_b32 v33, s52, 12
-; SI-NEXT:    v_writelane_b32 v33, s53, 13
-; SI-NEXT:    v_writelane_b32 v33, s54, 14
-; SI-NEXT:    v_writelane_b32 v33, s55, 15
-; SI-NEXT:    v_writelane_b32 v33, s64, 16
-; SI-NEXT:    v_writelane_b32 v33, s65, 17
-; SI-NEXT:    v_writelane_b32 v33, s66, 18
-; SI-NEXT:    v_writelane_b32 v33, s67, 19
+; SI-NEXT:    v_writelane_b32 v33, s34, 0
+; SI-NEXT:    v_writelane_b32 v33, s35, 1
+; SI-NEXT:    v_writelane_b32 v33, s36, 2
+; SI-NEXT:    v_writelane_b32 v33, s37, 3
+; SI-NEXT:    v_writelane_b32 v33, s38, 4
+; SI-NEXT:    v_writelane_b32 v33, s39, 5
+; SI-NEXT:    v_writelane_b32 v33, s48, 6
+; SI-NEXT:    v_writelane_b32 v33, s49, 7
+; SI-NEXT:    v_writelane_b32 v33, s50, 8
+; SI-NEXT:    v_writelane_b32 v33, s51, 9
+; SI-NEXT:    v_writelane_b32 v33, s52, 10
+; SI-NEXT:    v_writelane_b32 v33, s53, 11
+; SI-NEXT:    v_writelane_b32 v33, s54, 12
+; SI-NEXT:    v_writelane_b32 v33, s55, 13
+; SI-NEXT:    v_writelane_b32 v33, s64, 14
+; SI-NEXT:    v_writelane_b32 v33, s65, 15
+; SI-NEXT:    v_writelane_b32 v33, s66, 16
+; SI-NEXT:    v_writelane_b32 v33, s67, 17
+; SI-NEXT:    v_writelane_b32 v33, s68, 18
+; SI-NEXT:    v_writelane_b32 v33, s69, 19
+; SI-NEXT:    v_writelane_b32 v33, s70, 20
+; SI-NEXT:    v_writelane_b32 v33, s71, 21
+; SI-NEXT:    v_writelane_b32 v33, s80, 22
+; SI-NEXT:    v_writelane_b32 v33, s81, 23
+; SI-NEXT:    v_writelane_b32 v33, s82, 24
 ; SI-NEXT:    s_lshr_b32 s5, s16, 16
 ; SI-NEXT:    ; implicit-def: $vgpr34 : SGPR spill to VGPR lane
-; SI-NEXT:    v_writelane_b32 v33, s68, 20
+; SI-NEXT:    v_writelane_b32 v33, s83, 25
 ; SI-NEXT:    s_lshr_b32 s6, s17, 16
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_writelane_b32 v34, s5, 0
-; SI-NEXT:    v_writelane_b32 v33, s69, 21
+; SI-NEXT:    v_writelane_b32 v33, s84, 26
 ; SI-NEXT:    s_lshr_b32 s8, s18, 16
 ; SI-NEXT:    v_writelane_b32 v34, s6, 1
-; SI-NEXT:    v_writelane_b32 v33, s70, 22
+; SI-NEXT:    v_writelane_b32 v33, s85, 27
 ; SI-NEXT:    s_lshr_b32 s90, s19, 16
 ; SI-NEXT:    v_writelane_b32 v34, s8, 2
-; SI-NEXT:    v_writelane_b32 v33, s71, 23
+; SI-NEXT:    v_writelane_b32 v33, s86, 28
 ; SI-NEXT:    s_lshr_b32 s91, s20, 16
 ; SI-NEXT:    v_writelane_b32 v34, s90, 3
-; SI-NEXT:    v_writelane_b32 v33, s80, 24
+; SI-NEXT:    v_writelane_b32 v33, s87, 29
 ; SI-NEXT:    s_lshr_b32 s92, s21, 16
 ; SI-NEXT:    v_writelane_b32 v34, s91, 4
-; SI-NEXT:    v_writelane_b32 v33, s81, 25
+; SI-NEXT:    v_writelane_b32 v33, s96, 30
 ; SI-NEXT:    s_lshr_b32 s93, s22, 16
 ; SI-NEXT:    v_writelane_b32 v34, s92, 5
-; SI-NEXT:    v_writelane_b32 v33, s82, 26
+; SI-NEXT:    v_writelane_b32 v33, s97, 31
 ; SI-NEXT:    s_lshr_b32 s94, s23, 16
 ; SI-NEXT:    v_writelane_b32 v34, s93, 6
-; SI-NEXT:    v_writelane_b32 v33, s83, 27
+; SI-NEXT:    v_writelane_b32 v33, s98, 32
 ; SI-NEXT:    s_lshr_b32 s95, s24, 16
 ; SI-NEXT:    v_writelane_b32 v34, s94, 7
-; SI-NEXT:    v_writelane_b32 v33, s84, 28
+; SI-NEXT:    v_writelane_b32 v33, s99, 33
 ; SI-NEXT:    s_lshr_b32 vcc_lo, s25, 16
 ; SI-NEXT:    v_writelane_b32 v34, s95, 8
-; SI-NEXT:    v_writelane_b32 v33, s85, 29
+; SI-NEXT:    v_writelane_b32 v33, s30, 34
 ; SI-NEXT:    s_lshr_b32 vcc_hi, s26, 16
 ; SI-NEXT:    v_writelane_b32 v34, vcc_lo, 9
-; SI-NEXT:    v_writelane_b32 v33, s86, 30
+; SI-NEXT:    v_writelane_b32 v33, s31, 35
 ; SI-NEXT:    s_lshr_b32 s30, s27, 16
 ; SI-NEXT:    v_writelane_b32 v34, vcc_hi, 10
-; SI-NEXT:    v_writelane_b32 v33, s87, 31
 ; SI-NEXT:    s_lshr_b32 s31, s28, 16
 ; SI-NEXT:    v_writelane_b32 v34, s30, 11
-; SI-NEXT:    v_writelane_b32 v33, s96, 32
 ; SI-NEXT:    s_lshr_b32 s34, s29, 16
 ; SI-NEXT:    v_readfirstlane_b32 s89, v17
 ; SI-NEXT:    v_readfirstlane_b32 s79, v16
@@ -224748,7 +224751,6 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a
 ; SI-NEXT:    v_readfirstlane_b32 s9, v1
 ; SI-NEXT:    v_readfirstlane_b32 s7, v0
 ; SI-NEXT:    v_writelane_b32 v34, s31, 12
-; SI-NEXT:    v_writelane_b32 v33, s97, 33
 ; SI-NEXT:    s_lshr_b32 s88, s89, 16
 ; SI-NEXT:    s_lshr_b32 s78, s79, 16
 ; SI-NEXT:    s_lshr_b32 s76, s77, 16
@@ -224769,10 +224771,8 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a
 ; SI-NEXT:    s_lshr_b32 s35, s7, 16
 ; SI-NEXT:    v_readfirstlane_b32 s4, v18
 ; SI-NEXT:    v_writelane_b32 v34, s34, 13
-; SI-NEXT:    v_writelane_b32 v33, s98, 34
 ; SI-NEXT:    s_cmp_lg_u32 s4, 0
 ; SI-NEXT:    v_writelane_b32 v34, s35, 14
-; SI-NEXT:    v_writelane_b32 v33, s99, 35
 ; SI-NEXT:    v_writelane_b32 v34, s36, 15
 ; SI-NEXT:    s_cbranch_scc0 .LBB107_2
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
@@ -225389,43 +225389,43 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a
 ; SI-NEXT:    v_mul_f32_e64 v31, 1.0, s91
 ; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v31
 ; SI-NEXT:    v_mul_f32_e64 v31, 1.0, s4
+; SI-NEXT:    v_readlane_b32 s30, v33, 34
 ; SI-NEXT:    v_lshr_b64 v[31:32], v[31:32], 16
-; SI-NEXT:    v_readlane_b32 s99, v33, 35
-; SI-NEXT:    v_readlane_b32 s98, v33, 34
-; SI-NEXT:    v_readlane_b32 s97, v33, 33
-; SI-NEXT:    v_readlane_b32 s96, v33, 32
-; SI-NEXT:    v_readlane_b32 s87, v33, 31
-; SI-NEXT:    v_readlane_b32 s86, v33, 30
-; SI-NEXT:    v_readlane_b32 s85, v33, 29
-; SI-NEXT:    v_readlane_b32 s84, v33, 28
-; SI-NEXT:    v_readlane_b32 s83, v33, 27
-; SI-NEXT:    v_readlane_b32 s82, v33, 26
-; SI-NEXT:    v_readlane_b32 s81, v33, 25
-; SI-NEXT:    v_readlane_b32 s80, v33, 24
-; SI-NEXT:    v_readlane_b32 s71, v33, 23
-; SI-NEXT:    v_readlane_b32 s70, v33, 22
-; SI-NEXT:    v_readlane_b32 s69, v33, 21
-; SI-NEXT:    v_readlane_b32 s68, v33, 20
-; SI-NEXT:    v_readlane_b32 s67, v33, 19
-; SI-NEXT:    v_readlane_b32 s66, v33, 18
-; SI-NEXT:    v_readlane_b32 s65, v33, 17
-; SI-NEXT:    v_readlane_b32 s64, v33, 16
-; SI-NEXT:    v_readlane_b32 s55, v33, 15
-; SI-NEXT:    v_readlane_b32 s54, v33, 14
-; SI-NEXT:    v_readlane_b32 s53, v33, 13
-; SI-NEXT:    v_readlane_b32 s52, v33, 12
-; SI-NEXT:    v_readlane_b32 s51, v33, 11
-; SI-NEXT:    v_readlane_b32 s50, v33, 10
-; SI-NEXT:    v_readlane_b32 s49, v33, 9
-; SI-NEXT:    v_readlane_b32 s48, v33, 8
-; SI-NEXT:    v_readlane_b32 s39, v33, 7
-; SI-NEXT:    v_readlane_b32 s38, v33, 6
-; SI-NEXT:    v_readlane_b32 s37, v33, 5
-; SI-NEXT:    v_readlane_b32 s36, v33, 4
-; SI-NEXT:    v_readlane_b32 s35, v33, 3
-; SI-NEXT:    v_readlane_b32 s34, v33, 2
-; SI-NEXT:    v_readlane_b32 s31, v33, 1
-; SI-NEXT:    v_readlane_b32 s30, v33, 0
+; SI-NEXT:    v_readlane_b32 s31, v33, 35
+; SI-NEXT:    v_readlane_b32 s99, v33, 33
+; SI-NEXT:    v_readlane_b32 s98, v33, 32
+; SI-NEXT:    v_readlane_b32 s97, v33, 31
+; SI-NEXT:    v_readlane_b32 s96, v33, 30
+; SI-NEXT:    v_readlane_b32 s87, v33, 29
+; SI-NEXT:    v_readlane_b32 s86, v33, 28
+; SI-NEXT:    v_readlane_b32 s85, v33, 27
+; SI-NEXT:    v_readlane_b32 s84, v33, 26
+; SI-NEXT:    v_readlane_b32 s83, v33, 25
+; SI-NEXT:    v_readlane_b32 s82, v33, 24
+; SI-NEXT:    v_readlane_b32 s81, v33, 23
+; SI-NEXT:    v_readlane_b32 s80, v33, 22
+; SI-NEXT:    v_readlane_b32 s71, v33, 21
+; SI-NEXT:    v_readlane_b32 s70, v33, 20
+; SI-NEXT:    v_readlane_b32 s69, v33, 19
+; SI-NEXT:    v_readlane_b32 s68, v33, 18
+; SI-NEXT:    v_readlane_b32 s67, v33, 17
+; SI-NEXT:    v_readlane_b32 s66, v33, 16
+; SI-NEXT:    v_readlane_b32 s65, v33, 15
+; SI-NEXT:    v_readlane_b32 s64, v33, 14
+; SI-NEXT:    v_readlane_b32 s55, v33, 13
+; SI-NEXT:    v_readlane_b32 s54, v33, 12
+; SI-NEXT:    v_readlane_b32 s53, v33, 11
+; SI-NEXT:    v_readlane_b32 s52, v33, 10
+; SI-NEXT:    v_readlane_b32 s51, v33, 9
+; SI-NEXT:    v_readlane_b32 s50, v33, 8
+; SI-NEXT:    v_readlane_b32 s49, v33, 7
+; SI-NEXT:    v_readlane_b32 s48, v33, 6
+; SI-NEXT:    v_readlane_b32 s39, v33, 5
+; SI-NEXT:    v_readlane_b32 s38, v33, 4
+; SI-NEXT:    v_readlane_b32 s37, v33, 3
+; SI-NEXT:    v_readlane_b32 s36, v33, 2
+; SI-NEXT:    v_readlane_b32 s35, v33, 1
+; SI-NEXT:    v_readlane_b32 s34, v33, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -225439,14 +225439,14 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v32, s30, 0
-; VI-NEXT:    v_writelane_b32 v32, s31, 1
-; VI-NEXT:    v_writelane_b32 v32, s34, 2
-; VI-NEXT:    v_writelane_b32 v32, s35, 3
-; VI-NEXT:    v_writelane_b32 v32, s36, 4
-; VI-NEXT:    v_writelane_b32 v32, s37, 5
+; VI-NEXT:    v_writelane_b32 v32, s34, 0
+; VI-NEXT:    v_writelane_b32 v32, s35, 1
+; VI-NEXT:    v_writelane_b32 v32, s36, 2
+; VI-NEXT:    v_writelane_b32 v32, s37, 3
+; VI-NEXT:    v_writelane_b32 v32, s38, 4
+; VI-NEXT:    v_writelane_b32 v32, s39, 5
 ; VI-NEXT:    v_readfirstlane_b32 s4, v18
-; VI-NEXT:    v_writelane_b32 v32, s38, 6
+; VI-NEXT:    v_writelane_b32 v32, s30, 6
 ; VI-NEXT:    v_readfirstlane_b32 s8, v17
 ; VI-NEXT:    v_readfirstlane_b32 s9, v16
 ; VI-NEXT:    v_readfirstlane_b32 s10, v15
@@ -225466,7 +225466,7 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a
 ; VI-NEXT:    v_readfirstlane_b32 s6, v1
 ; VI-NEXT:    s_cmp_lg_u32 s4, 0
 ; VI-NEXT:    v_readfirstlane_b32 s7, v0
-; VI-NEXT:    v_writelane_b32 v32, s39, 7
+; VI-NEXT:    v_writelane_b32 v32, s31, 7
 ; VI-NEXT:    s_cbranch_scc0 .LBB107_4
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    s_cbranch_execnz .LBB107_3
@@ -225632,6 +225632,7 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a
 ; VI-NEXT:    s_add_i32 s46, s46, 0x30000
 ; VI-NEXT:    s_add_i32 s47, s4, 0x30000
 ; VI-NEXT:  .LBB107_3: ; %end
+; VI-NEXT:    v_readlane_b32 s30, v32, 6
 ; VI-NEXT:    v_mov_b32_e32 v0, s16
 ; VI-NEXT:    v_mov_b32_e32 v1, s17
 ; VI-NEXT:    v_mov_b32_e32 v2, s18
@@ -225664,14 +225665,13 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a
 ; VI-NEXT:    v_mov_b32_e32 v29, s10
 ; VI-NEXT:    v_mov_b32_e32 v30, s9
 ; VI-NEXT:    v_mov_b32_e32 v31, s8
-; VI-NEXT:    v_readlane_b32 s39, v32, 7
-; VI-NEXT:    v_readlane_b32 s38, v32, 6
-; VI-NEXT:    v_readlane_b32 s37, v32, 5
-; VI-NEXT:    v_readlane_b32 s36, v32, 4
-; VI-NEXT:    v_readlane_b32 s35, v32, 3
-; VI-NEXT:    v_readlane_b32 s34, v32, 2
-; VI-NEXT:    v_readlane_b32 s31, v32, 1
-; VI-NEXT:    v_readlane_b32 s30, v32, 0
+; VI-NEXT:    v_readlane_b32 s31, v32, 7
+; VI-NEXT:    v_readlane_b32 s39, v32, 5
+; VI-NEXT:    v_readlane_b32 s38, v32, 4
+; VI-NEXT:    v_readlane_b32 s37, v32, 3
+; VI-NEXT:    v_readlane_b32 s36, v32, 2
+; VI-NEXT:    v_readlane_b32 s35, v32, 1
+; VI-NEXT:    v_readlane_b32 s34, v32, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -225686,17 +225686,17 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v32, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v32, s31, 1
-; GFX9-NEXT:    v_writelane_b32 v32, s36, 2
-; GFX9-NEXT:    v_writelane_b32 v32, s37, 3
-; GFX9-NEXT:    v_writelane_b32 v32, s38, 4
-; GFX9-NEXT:    v_writelane_b32 v32, s39, 5
-; GFX9-NEXT:    v_writelane_b32 v32, s48, 6
-; GFX9-NEXT:    v_writelane_b32 v32, s49, 7
-; GFX9-NEXT:    v_writelane_b32 v32, s50, 8
+; GFX9-NEXT:    v_writelane_b32 v32, s36, 0
+; GFX9-NEXT:    v_writelane_b32 v32, s37, 1
+; GFX9-NEXT:    v_writelane_b32 v32, s38, 2
+; GFX9-NEXT:    v_writelane_b32 v32, s39, 3
+; GFX9-NEXT:    v_writelane_b32 v32, s48, 4
+; GFX9-NEXT:    v_writelane_b32 v32, s49, 5
+; GFX9-NEXT:    v_writelane_b32 v32, s50, 6
+; GFX9-NEXT:    v_writelane_b32 v32, s51, 7
+; GFX9-NEXT:    v_writelane_b32 v32, s30, 8
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v18
-; GFX9-NEXT:    v_writelane_b32 v32, s51, 9
+; GFX9-NEXT:    v_writelane_b32 v32, s31, 9
 ; GFX9-NEXT:    v_readfirstlane_b32 s51, v17
 ; GFX9-NEXT:    v_readfirstlane_b32 s50, v16
 ; GFX9-NEXT:    v_readfirstlane_b32 s49, v15
@@ -225789,16 +225789,16 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a
 ; GFX9-NEXT:    v_mov_b32_e32 v14, s30
 ; GFX9-NEXT:    v_mov_b32_e32 v15, s31
 ; GFX9-NEXT:  .LBB107_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s51, v32, 9
-; GFX9-NEXT:    v_readlane_b32 s50, v32, 8
-; GFX9-NEXT:    v_readlane_b32 s49, v32, 7
-; GFX9-NEXT:    v_readlane_b32 s48, v32, 6
-; GFX9-NEXT:    v_readlane_b32 s39, v32, 5
-; GFX9-NEXT:    v_readlane_b32 s38, v32, 4
-; GFX9-NEXT:    v_readlane_b32 s37, v32, 3
-; GFX9-NEXT:    v_readlane_b32 s36, v32, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v32, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v32, 0
+; GFX9-NEXT:    v_readlane_b32 s30, v32, 8
+; GFX9-NEXT:    v_readlane_b32 s31, v32, 9
+; GFX9-NEXT:    v_readlane_b32 s51, v32, 7
+; GFX9-NEXT:    v_readlane_b32 s50, v32, 6
+; GFX9-NEXT:    v_readlane_b32 s49, v32, 5
+; GFX9-NEXT:    v_readlane_b32 s48, v32, 4
+; GFX9-NEXT:    v_readlane_b32 s39, v32, 3
+; GFX9-NEXT:    v_readlane_b32 s38, v32, 2
+; GFX9-NEXT:    v_readlane_b32 s37, v32, 1
+; GFX9-NEXT:    v_readlane_b32 s36, v32, 0
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -226652,13 +226652,14 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v63, s30, 0
-; SI-NEXT:    v_writelane_b32 v63, s31, 1
-; SI-NEXT:    v_writelane_b32 v63, s34, 2
-; SI-NEXT:    v_writelane_b32 v63, s35, 3
-; SI-NEXT:    v_writelane_b32 v63, s36, 4
-; SI-NEXT:    v_writelane_b32 v63, s37, 5
-; SI-NEXT:    v_writelane_b32 v63, s38, 6
+; SI-NEXT:    v_writelane_b32 v63, s34, 0
+; SI-NEXT:    v_writelane_b32 v63, s35, 1
+; SI-NEXT:    v_writelane_b32 v63, s36, 2
+; SI-NEXT:    v_writelane_b32 v63, s37, 3
+; SI-NEXT:    v_writelane_b32 v63, s38, 4
+; SI-NEXT:    v_writelane_b32 v63, s39, 5
+; SI-NEXT:    v_writelane_b32 v63, s30, 6
+; SI-NEXT:    v_writelane_b32 v63, s31, 7
 ; SI-NEXT:    v_readfirstlane_b32 s10, v17
 ; SI-NEXT:    v_readfirstlane_b32 s6, v16
 ; SI-NEXT:    v_readfirstlane_b32 s11, v15
@@ -226677,7 +226678,6 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i
 ; SI-NEXT:    v_readfirstlane_b32 s30, v2
 ; SI-NEXT:    v_readfirstlane_b32 s44, v1
 ; SI-NEXT:    v_readfirstlane_b32 s35, v0
-; SI-NEXT:    v_writelane_b32 v63, s39, 7
 ; SI-NEXT:    s_lshr_b32 s43, s29, 16
 ; SI-NEXT:    s_lshr_b32 s93, s28, 16
 ; SI-NEXT:    s_lshr_b32 s42, s27, 16
@@ -227197,14 +227197,14 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i
 ; SI-NEXT:    v_and_b32_e32 v29, 0xffff, v29
 ; SI-NEXT:    v_lshlrev_b32_e32 v32, 16, v32
 ; SI-NEXT:    v_and_b32_e32 v31, 0xffff, v31
-; SI-NEXT:    v_readlane_b32 s39, v63, 7
-; SI-NEXT:    v_readlane_b32 s38, v63, 6
-; SI-NEXT:    v_readlane_b32 s37, v63, 5
-; SI-NEXT:    v_readlane_b32 s36, v63, 4
-; SI-NEXT:    v_readlane_b32 s35, v63, 3
-; SI-NEXT:    v_readlane_b32 s34, v63, 2
-; SI-NEXT:    v_readlane_b32 s31, v63, 1
-; SI-NEXT:    v_readlane_b32 s30, v63, 0
+; SI-NEXT:    v_readlane_b32 s30, v63, 6
+; SI-NEXT:    v_readlane_b32 s31, v63, 7
+; SI-NEXT:    v_readlane_b32 s39, v63, 5
+; SI-NEXT:    v_readlane_b32 s38, v63, 4
+; SI-NEXT:    v_readlane_b32 s37, v63, 3
+; SI-NEXT:    v_readlane_b32 s36, v63, 2
+; SI-NEXT:    v_readlane_b32 s35, v63, 1
+; SI-NEXT:    v_readlane_b32 s34, v63, 0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v9, 0xffff, v10
 ; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v11
@@ -227308,17 +227308,17 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v33, s30, 0
-; VI-NEXT:    v_writelane_b32 v33, s31, 1
-; VI-NEXT:    v_writelane_b32 v33, s36, 2
-; VI-NEXT:    v_writelane_b32 v33, s37, 3
-; VI-NEXT:    v_writelane_b32 v33, s38, 4
-; VI-NEXT:    v_writelane_b32 v33, s39, 5
-; VI-NEXT:    v_writelane_b32 v33, s48, 6
-; VI-NEXT:    v_writelane_b32 v33, s49, 7
-; VI-NEXT:    v_writelane_b32 v33, s50, 8
+; VI-NEXT:    v_writelane_b32 v33, s36, 0
+; VI-NEXT:    v_writelane_b32 v33, s37, 1
+; VI-NEXT:    v_writelane_b32 v33, s38, 2
+; VI-NEXT:    v_writelane_b32 v33, s39, 3
+; VI-NEXT:    v_writelane_b32 v33, s48, 4
+; VI-NEXT:    v_writelane_b32 v33, s49, 5
+; VI-NEXT:    v_writelane_b32 v33, s50, 6
+; VI-NEXT:    v_writelane_b32 v33, s51, 7
+; VI-NEXT:    v_writelane_b32 v33, s30, 8
 ; VI-NEXT:    v_readfirstlane_b32 s4, v18
-; VI-NEXT:    v_writelane_b32 v33, s51, 9
+; VI-NEXT:    v_writelane_b32 v33, s31, 9
 ; VI-NEXT:    v_readfirstlane_b32 s51, v17
 ; VI-NEXT:    v_readfirstlane_b32 s50, v16
 ; VI-NEXT:    v_readfirstlane_b32 s49, v15
@@ -227540,16 +227540,16 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i
 ; VI-NEXT:    v_mov_b32_e32 v14, s30
 ; VI-NEXT:    v_mov_b32_e32 v15, s31
 ; VI-NEXT:  .LBB109_5: ; %end
-; VI-NEXT:    v_readlane_b32 s51, v33, 9
-; VI-NEXT:    v_readlane_b32 s50, v33, 8
-; VI-NEXT:    v_readlane_b32 s49, v33, 7
-; VI-NEXT:    v_readlane_b32 s48, v33, 6
-; VI-NEXT:    v_readlane_b32 s39, v33, 5
-; VI-NEXT:    v_readlane_b32 s38, v33, 4
-; VI-NEXT:    v_readlane_b32 s37, v33, 3
-; VI-NEXT:    v_readlane_b32 s36, v33, 2
-; VI-NEXT:    v_readlane_b32 s31, v33, 1
-; VI-NEXT:    v_readlane_b32 s30, v33, 0
+; VI-NEXT:    v_readlane_b32 s30, v33, 8
+; VI-NEXT:    v_readlane_b32 s31, v33, 9
+; VI-NEXT:    v_readlane_b32 s51, v33, 7
+; VI-NEXT:    v_readlane_b32 s50, v33, 6
+; VI-NEXT:    v_readlane_b32 s49, v33, 5
+; VI-NEXT:    v_readlane_b32 s48, v33, 4
+; VI-NEXT:    v_readlane_b32 s39, v33, 3
+; VI-NEXT:    v_readlane_b32 s38, v33, 2
+; VI-NEXT:    v_readlane_b32 s37, v33, 1
+; VI-NEXT:    v_readlane_b32 s36, v33, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -227562,17 +227562,17 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v32, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v32, s31, 1
-; GFX9-NEXT:    v_writelane_b32 v32, s36, 2
-; GFX9-NEXT:    v_writelane_b32 v32, s37, 3
-; GFX9-NEXT:    v_writelane_b32 v32, s38, 4
-; GFX9-NEXT:    v_writelane_b32 v32, s39, 5
-; GFX9-NEXT:    v_writelane_b32 v32, s48, 6
-; GFX9-NEXT:    v_writelane_b32 v32, s49, 7
-; GFX9-NEXT:    v_writelane_b32 v32, s50, 8
+; GFX9-NEXT:    v_writelane_b32 v32, s36, 0
+; GFX9-NEXT:    v_writelane_b32 v32, s37, 1
+; GFX9-NEXT:    v_writelane_b32 v32, s38, 2
+; GFX9-NEXT:    v_writelane_b32 v32, s39, 3
+; GFX9-NEXT:    v_writelane_b32 v32, s48, 4
+; GFX9-NEXT:    v_writelane_b32 v32, s49, 5
+; GFX9-NEXT:    v_writelane_b32 v32, s50, 6
+; GFX9-NEXT:    v_writelane_b32 v32, s51, 7
+; GFX9-NEXT:    v_writelane_b32 v32, s30, 8
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v18
-; GFX9-NEXT:    v_writelane_b32 v32, s51, 9
+; GFX9-NEXT:    v_writelane_b32 v32, s31, 9
 ; GFX9-NEXT:    v_readfirstlane_b32 s51, v17
 ; GFX9-NEXT:    v_readfirstlane_b32 s50, v16
 ; GFX9-NEXT:    v_readfirstlane_b32 s49, v15
@@ -227666,16 +227666,16 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i
 ; GFX9-NEXT:    v_mov_b32_e32 v14, s30
 ; GFX9-NEXT:    v_mov_b32_e32 v15, s31
 ; GFX9-NEXT:  .LBB109_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s51, v32, 9
-; GFX9-NEXT:    v_readlane_b32 s50, v32, 8
-; GFX9-NEXT:    v_readlane_b32 s49, v32, 7
-; GFX9-NEXT:    v_readlane_b32 s48, v32, 6
-; GFX9-NEXT:    v_readlane_b32 s39, v32, 5
-; GFX9-NEXT:    v_readlane_b32 s38, v32, 4
-; GFX9-NEXT:    v_readlane_b32 s37, v32, 3
-; GFX9-NEXT:    v_readlane_b32 s36, v32, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v32, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v32, 0
+; GFX9-NEXT:    v_readlane_b32 s30, v32, 8
+; GFX9-NEXT:    v_readlane_b32 s31, v32, 9
+; GFX9-NEXT:    v_readlane_b32 s51, v32, 7
+; GFX9-NEXT:    v_readlane_b32 s50, v32, 6
+; GFX9-NEXT:    v_readlane_b32 s49, v32, 5
+; GFX9-NEXT:    v_readlane_b32 s48, v32, 4
+; GFX9-NEXT:    v_readlane_b32 s39, v32, 3
+; GFX9-NEXT:    v_readlane_b32 s38, v32, 2
+; GFX9-NEXT:    v_readlane_b32 s37, v32, 1
+; GFX9-NEXT:    v_readlane_b32 s36, v32, 0
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -229002,90 +229002,91 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_writelane_b32 v32, s30, 0
-; SI-NEXT:    v_writelane_b32 v32, s31, 1
-; SI-NEXT:    v_writelane_b32 v32, s34, 2
-; SI-NEXT:    v_writelane_b32 v32, s35, 3
-; SI-NEXT:    v_writelane_b32 v32, s36, 4
-; SI-NEXT:    v_writelane_b32 v32, s37, 5
-; SI-NEXT:    v_writelane_b32 v32, s38, 6
-; SI-NEXT:    v_writelane_b32 v32, s39, 7
-; SI-NEXT:    v_writelane_b32 v32, s48, 8
-; SI-NEXT:    v_writelane_b32 v32, s49, 9
-; SI-NEXT:    v_writelane_b32 v32, s50, 10
-; SI-NEXT:    v_writelane_b32 v32, s51, 11
+; SI-NEXT:    v_writelane_b32 v32, s34, 0
+; SI-NEXT:    v_writelane_b32 v32, s35, 1
+; SI-NEXT:    v_writelane_b32 v32, s36, 2
+; SI-NEXT:    v_writelane_b32 v32, s37, 3
+; SI-NEXT:    v_writelane_b32 v32, s38, 4
+; SI-NEXT:    v_writelane_b32 v32, s39, 5
+; SI-NEXT:    v_writelane_b32 v32, s48, 6
+; SI-NEXT:    v_writelane_b32 v32, s49, 7
+; SI-NEXT:    v_writelane_b32 v32, s50, 8
+; SI-NEXT:    v_writelane_b32 v32, s51, 9
+; SI-NEXT:    v_writelane_b32 v32, s52, 10
+; SI-NEXT:    v_writelane_b32 v32, s53, 11
+; SI-NEXT:    v_writelane_b32 v32, s54, 12
+; SI-NEXT:    v_writelane_b32 v32, s55, 13
+; SI-NEXT:    v_writelane_b32 v32, s64, 14
+; SI-NEXT:    v_writelane_b32 v32, s65, 15
+; SI-NEXT:    v_writelane_b32 v32, s66, 16
+; SI-NEXT:    v_writelane_b32 v32, s67, 17
+; SI-NEXT:    v_writelane_b32 v32, s68, 18
+; SI-NEXT:    v_writelane_b32 v32, s69, 19
+; SI-NEXT:    v_writelane_b32 v32, s70, 20
+; SI-NEXT:    v_writelane_b32 v32, s71, 21
 ; SI-NEXT:    s_lshr_b32 s4, s27, 16
 ; SI-NEXT:    ; implicit-def: $vgpr33 : SGPR spill to VGPR lane
-; SI-NEXT:    v_writelane_b32 v32, s52, 12
+; SI-NEXT:    v_writelane_b32 v32, s80, 22
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_writelane_b32 v33, s4, 0
 ; SI-NEXT:    s_lshr_b32 s4, s25, 16
-; SI-NEXT:    v_writelane_b32 v32, s53, 13
+; SI-NEXT:    v_writelane_b32 v32, s81, 23
 ; SI-NEXT:    v_writelane_b32 v33, s4, 1
 ; SI-NEXT:    s_lshr_b32 s4, s24, 16
-; SI-NEXT:    v_writelane_b32 v32, s54, 14
+; SI-NEXT:    v_writelane_b32 v32, s82, 24
 ; SI-NEXT:    v_writelane_b32 v33, s4, 2
-; SI-NEXT:    v_writelane_b32 v32, s55, 15
+; SI-NEXT:    v_writelane_b32 v32, s83, 25
 ; SI-NEXT:    v_writelane_b32 v33, s23, 3
 ; SI-NEXT:    s_lshr_b32 s4, s23, 16
-; SI-NEXT:    v_writelane_b32 v32, s64, 16
+; SI-NEXT:    v_writelane_b32 v32, s84, 26
 ; SI-NEXT:    v_writelane_b32 v33, s4, 4
 ; SI-NEXT:    s_lshr_b32 s4, s21, 16
-; SI-NEXT:    v_writelane_b32 v32, s65, 17
+; SI-NEXT:    v_writelane_b32 v32, s85, 27
 ; SI-NEXT:    v_writelane_b32 v33, s4, 5
-; SI-NEXT:    v_writelane_b32 v32, s66, 18
+; SI-NEXT:    v_writelane_b32 v32, s86, 28
 ; SI-NEXT:    v_writelane_b32 v33, s19, 6
 ; SI-NEXT:    s_lshr_b32 s4, s19, 16
-; SI-NEXT:    v_writelane_b32 v32, s67, 19
+; SI-NEXT:    v_writelane_b32 v32, s87, 29
 ; SI-NEXT:    v_writelane_b32 v33, s4, 7
-; SI-NEXT:    v_writelane_b32 v32, s68, 20
+; SI-NEXT:    v_writelane_b32 v32, s96, 30
 ; SI-NEXT:    v_writelane_b32 v33, s17, 8
 ; SI-NEXT:    s_lshr_b32 s4, s17, 16
-; SI-NEXT:    v_writelane_b32 v32, s69, 21
+; SI-NEXT:    v_writelane_b32 v32, s97, 31
 ; SI-NEXT:    v_writelane_b32 v33, s4, 9
 ; SI-NEXT:    s_lshr_b32 s4, s16, 16
-; SI-NEXT:    v_writelane_b32 v32, s70, 22
+; SI-NEXT:    v_writelane_b32 v32, s98, 32
 ; SI-NEXT:    s_mov_b32 s46, s16
 ; SI-NEXT:    v_writelane_b32 v33, s4, 10
-; SI-NEXT:    v_writelane_b32 v32, s71, 23
+; SI-NEXT:    v_writelane_b32 v32, s99, 33
 ; SI-NEXT:    s_mov_b32 s47, s18
 ; SI-NEXT:    v_writelane_b32 v33, s46, 11
-; SI-NEXT:    v_writelane_b32 v32, s80, 24
+; SI-NEXT:    v_writelane_b32 v32, s30, 34
 ; SI-NEXT:    s_mov_b32 s57, s20
 ; SI-NEXT:    v_writelane_b32 v33, s47, 12
-; SI-NEXT:    v_writelane_b32 v32, s81, 25
+; SI-NEXT:    v_writelane_b32 v32, s31, 35
 ; SI-NEXT:    s_mov_b32 s31, s22
 ; SI-NEXT:    v_writelane_b32 v33, s57, 13
-; SI-NEXT:    v_writelane_b32 v32, s82, 26
 ; SI-NEXT:    s_mov_b32 s35, s24
 ; SI-NEXT:    v_writelane_b32 v33, s31, 14
-; SI-NEXT:    v_writelane_b32 v32, s83, 27
 ; SI-NEXT:    s_mov_b32 s69, s26
 ; SI-NEXT:    v_writelane_b32 v33, s35, 15
-; SI-NEXT:    v_writelane_b32 v32, s84, 28
 ; SI-NEXT:    s_mov_b32 s78, s28
 ; SI-NEXT:    v_writelane_b32 v33, s69, 16
-; SI-NEXT:    v_writelane_b32 v32, s85, 29
 ; SI-NEXT:    v_readfirstlane_b32 s77, v10
 ; SI-NEXT:    v_readfirstlane_b32 s45, v0
 ; SI-NEXT:    v_writelane_b32 v33, s78, 17
-; SI-NEXT:    v_writelane_b32 v32, s86, 30
 ; SI-NEXT:    s_lshr_b32 s34, s18, 16
 ; SI-NEXT:    s_lshr_b32 s18, s77, 16
 ; SI-NEXT:    v_writelane_b32 v33, s45, 18
-; SI-NEXT:    v_writelane_b32 v32, s87, 31
 ; SI-NEXT:    v_readfirstlane_b32 s19, v13
 ; SI-NEXT:    v_readfirstlane_b32 s23, v12
 ; SI-NEXT:    v_writelane_b32 v33, s18, 19
-; SI-NEXT:    v_writelane_b32 v32, s96, 32
 ; SI-NEXT:    s_lshr_b32 s36, s20, 16
 ; SI-NEXT:    s_lshr_b32 s20, s23, 16
 ; SI-NEXT:    v_writelane_b32 v33, s19, 20
-; SI-NEXT:    v_writelane_b32 v32, s97, 33
 ; SI-NEXT:    v_readfirstlane_b32 s58, v15
 ; SI-NEXT:    v_readfirstlane_b32 s88, v14
 ; SI-NEXT:    v_writelane_b32 v33, s20, 21
-; SI-NEXT:    v_writelane_b32 v32, s98, 34
 ; SI-NEXT:    s_mov_b32 s39, s27
 ; SI-NEXT:    v_readfirstlane_b32 s60, v17
 ; SI-NEXT:    v_readfirstlane_b32 s92, v16
@@ -229101,7 +229102,6 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i
 ; SI-NEXT:    v_readfirstlane_b32 s71, v2
 ; SI-NEXT:    v_readfirstlane_b32 s68, v1
 ; SI-NEXT:    v_writelane_b32 v33, s58, 22
-; SI-NEXT:    v_writelane_b32 v32, s99, 35
 ; SI-NEXT:    s_lshr_b32 s93, s29, 16
 ; SI-NEXT:    s_lshr_b32 s85, s28, 16
 ; SI-NEXT:    s_lshr_b32 s84, s26, 16
@@ -229679,6 +229679,7 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i
 ; SI-NEXT:    s_and_b32 s5, s5, 0xffff
 ; SI-NEXT:    s_lshl_b32 s46, s99, 16
 ; SI-NEXT:    s_or_b32 s5, s5, s46
+; SI-NEXT:    v_readlane_b32 s30, v32, 34
 ; SI-NEXT:    v_readlane_b32 s47, v33, 33
 ; SI-NEXT:    v_mov_b32_e32 v0, s16
 ; SI-NEXT:    v_mov_b32_e32 v1, s17
@@ -229712,42 +229713,41 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v29, s7
 ; SI-NEXT:    v_mov_b32_e32 v30, s4
 ; SI-NEXT:    v_mov_b32_e32 v31, s5
-; SI-NEXT:    v_readlane_b32 s99, v32, 35
-; SI-NEXT:    v_readlane_b32 s98, v32, 34
-; SI-NEXT:    v_readlane_b32 s97, v32, 33
-; SI-NEXT:    v_readlane_b32 s96, v32, 32
-; SI-NEXT:    v_readlane_b32 s87, v32, 31
-; SI-NEXT:    v_readlane_b32 s86, v32, 30
-; SI-NEXT:    v_readlane_b32 s85, v32, 29
-; SI-NEXT:    v_readlane_b32 s84, v32, 28
-; SI-NEXT:    v_readlane_b32 s83, v32, 27
-; SI-NEXT:    v_readlane_b32 s82, v32, 26
-; SI-NEXT:    v_readlane_b32 s81, v32, 25
-; SI-NEXT:    v_readlane_b32 s80, v32, 24
-; SI-NEXT:    v_readlane_b32 s71, v32, 23
-; SI-NEXT:    v_readlane_b32 s70, v32, 22
-; SI-NEXT:    v_readlane_b32 s69, v32, 21
-; SI-NEXT:    v_readlane_b32 s68, v32, 20
-; SI-NEXT:    v_readlane_b32 s67, v32, 19
-; SI-NEXT:    v_readlane_b32 s66, v32, 18
-; SI-NEXT:    v_readlane_b32 s65, v32, 17
-; SI-NEXT:    v_readlane_b32 s64, v32, 16
-; SI-NEXT:    v_readlane_b32 s55, v32, 15
-; SI-NEXT:    v_readlane_b32 s54, v32, 14
-; SI-NEXT:    v_readlane_b32 s53, v32, 13
-; SI-NEXT:    v_readlane_b32 s52, v32, 12
-; SI-NEXT:    v_readlane_b32 s51, v32, 11
-; SI-NEXT:    v_readlane_b32 s50, v32, 10
-; SI-NEXT:    v_readlane_b32 s49, v32, 9
-; SI-NEXT:    v_readlane_b32 s48, v32, 8
-; SI-NEXT:    v_readlane_b32 s39, v32, 7
-; SI-NEXT:    v_readlane_b32 s38, v32, 6
-; SI-NEXT:    v_readlane_b32 s37, v32, 5
-; SI-NEXT:    v_readlane_b32 s36, v32, 4
-; SI-NEXT:    v_readlane_b32 s35, v32, 3
-; SI-NEXT:    v_readlane_b32 s34, v32, 2
-; SI-NEXT:    v_readlane_b32 s31, v32, 1
-; SI-NEXT:    v_readlane_b32 s30, v32, 0
+; SI-NEXT:    v_readlane_b32 s31, v32, 35
+; SI-NEXT:    v_readlane_b32 s99, v32, 33
+; SI-NEXT:    v_readlane_b32 s98, v32, 32
+; SI-NEXT:    v_readlane_b32 s97, v32, 31
+; SI-NEXT:    v_readlane_b32 s96, v32, 30
+; SI-NEXT:    v_readlane_b32 s87, v32, 29
+; SI-NEXT:    v_readlane_b32 s86, v32, 28
+; SI-NEXT:    v_readlane_b32 s85, v32, 27
+; SI-NEXT:    v_readlane_b32 s84, v32, 26
+; SI-NEXT:    v_readlane_b32 s83, v32, 25
+; SI-NEXT:    v_readlane_b32 s82, v32, 24
+; SI-NEXT:    v_readlane_b32 s81, v32, 23
+; SI-NEXT:    v_readlane_b32 s80, v32, 22
+; SI-NEXT:    v_readlane_b32 s71, v32, 21
+; SI-NEXT:    v_readlane_b32 s70, v32, 20
+; SI-NEXT:    v_readlane_b32 s69, v32, 19
+; SI-NEXT:    v_readlane_b32 s68, v32, 18
+; SI-NEXT:    v_readlane_b32 s67, v32, 17
+; SI-NEXT:    v_readlane_b32 s66, v32, 16
+; SI-NEXT:    v_readlane_b32 s65, v32, 15
+; SI-NEXT:    v_readlane_b32 s64, v32, 14
+; SI-NEXT:    v_readlane_b32 s55, v32, 13
+; SI-NEXT:    v_readlane_b32 s54, v32, 12
+; SI-NEXT:    v_readlane_b32 s53, v32, 11
+; SI-NEXT:    v_readlane_b32 s52, v32, 10
+; SI-NEXT:    v_readlane_b32 s51, v32, 9
+; SI-NEXT:    v_readlane_b32 s50, v32, 8
+; SI-NEXT:    v_readlane_b32 s49, v32, 7
+; SI-NEXT:    v_readlane_b32 s48, v32, 6
+; SI-NEXT:    v_readlane_b32 s39, v32, 5
+; SI-NEXT:    v_readlane_b32 s38, v32, 4
+; SI-NEXT:    v_readlane_b32 s37, v32, 3
+; SI-NEXT:    v_readlane_b32 s36, v32, 2
+; SI-NEXT:    v_readlane_b32 s35, v32, 1
+; SI-NEXT:    v_readlane_b32 s34, v32, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -229803,14 +229803,14 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v32, s30, 0
-; VI-NEXT:    v_writelane_b32 v32, s31, 1
-; VI-NEXT:    v_writelane_b32 v32, s34, 2
-; VI-NEXT:    v_writelane_b32 v32, s35, 3
-; VI-NEXT:    v_writelane_b32 v32, s36, 4
-; VI-NEXT:    v_writelane_b32 v32, s37, 5
+; VI-NEXT:    v_writelane_b32 v32, s34, 0
+; VI-NEXT:    v_writelane_b32 v32, s35, 1
+; VI-NEXT:    v_writelane_b32 v32, s36, 2
+; VI-NEXT:    v_writelane_b32 v32, s37, 3
+; VI-NEXT:    v_writelane_b32 v32, s38, 4
+; VI-NEXT:    v_writelane_b32 v32, s39, 5
 ; VI-NEXT:    v_readfirstlane_b32 s4, v18
-; VI-NEXT:    v_writelane_b32 v32, s38, 6
+; VI-NEXT:    v_writelane_b32 v32, s30, 6
 ; VI-NEXT:    v_readfirstlane_b32 s8, v17
 ; VI-NEXT:    v_readfirstlane_b32 s9, v16
 ; VI-NEXT:    v_readfirstlane_b32 s10, v15
@@ -229830,7 +229830,7 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i
 ; VI-NEXT:    v_readfirstlane_b32 s6, v1
 ; VI-NEXT:    s_cmp_lg_u32 s4, 0
 ; VI-NEXT:    v_readfirstlane_b32 s7, v0
-; VI-NEXT:    v_writelane_b32 v32, s39, 7
+; VI-NEXT:    v_writelane_b32 v32, s31, 7
 ; VI-NEXT:    s_cbranch_scc0 .LBB111_4
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    s_cbranch_execnz .LBB111_3
@@ -229996,6 +229996,7 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i
 ; VI-NEXT:    s_add_i32 s46, s46, 0x30000
 ; VI-NEXT:    s_add_i32 s47, s4, 0x30000
 ; VI-NEXT:  .LBB111_3: ; %end
+; VI-NEXT:    v_readlane_b32 s30, v32, 6
 ; VI-NEXT:    v_mov_b32_e32 v0, s16
 ; VI-NEXT:    v_mov_b32_e32 v1, s17
 ; VI-NEXT:    v_mov_b32_e32 v2, s18
@@ -230028,14 +230029,13 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i
 ; VI-NEXT:    v_mov_b32_e32 v29, s10
 ; VI-NEXT:    v_mov_b32_e32 v30, s9
 ; VI-NEXT:    v_mov_b32_e32 v31, s8
-; VI-NEXT:    v_readlane_b32 s39, v32, 7
-; VI-NEXT:    v_readlane_b32 s38, v32, 6
-; VI-NEXT:    v_readlane_b32 s37, v32, 5
-; VI-NEXT:    v_readlane_b32 s36, v32, 4
-; VI-NEXT:    v_readlane_b32 s35, v32, 3
-; VI-NEXT:    v_readlane_b32 s34, v32, 2
-; VI-NEXT:    v_readlane_b32 s31, v32, 1
-; VI-NEXT:    v_readlane_b32 s30, v32, 0
+; VI-NEXT:    v_readlane_b32 s31, v32, 7
+; VI-NEXT:    v_readlane_b32 s39, v32, 5
+; VI-NEXT:    v_readlane_b32 s38, v32, 4
+; VI-NEXT:    v_readlane_b32 s37, v32, 3
+; VI-NEXT:    v_readlane_b32 s36, v32, 2
+; VI-NEXT:    v_readlane_b32 s35, v32, 1
+; VI-NEXT:    v_readlane_b32 s34, v32, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -230050,17 +230050,17 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v32, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v32, s31, 1
-; GFX9-NEXT:    v_writelane_b32 v32, s36, 2
-; GFX9-NEXT:    v_writelane_b32 v32, s37, 3
-; GFX9-NEXT:    v_writelane_b32 v32, s38, 4
-; GFX9-NEXT:    v_writelane_b32 v32, s39, 5
-; GFX9-NEXT:    v_writelane_b32 v32, s48, 6
-; GFX9-NEXT:    v_writelane_b32 v32, s49, 7
-; GFX9-NEXT:    v_writelane_b32 v32, s50, 8
+; GFX9-NEXT:    v_writelane_b32 v32, s36, 0
+; GFX9-NEXT:    v_writelane_b32 v32, s37, 1
+; GFX9-NEXT:    v_writelane_b32 v32, s38, 2
+; GFX9-NEXT:    v_writelane_b32 v32, s39, 3
+; GFX9-NEXT:    v_writelane_b32 v32, s48, 4
+; GFX9-NEXT:    v_writelane_b32 v32, s49, 5
+; GFX9-NEXT:    v_writelane_b32 v32, s50, 6
+; GFX9-NEXT:    v_writelane_b32 v32, s51, 7
+; GFX9-NEXT:    v_writelane_b32 v32, s30, 8
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v18
-; GFX9-NEXT:    v_writelane_b32 v32, s51, 9
+; GFX9-NEXT:    v_writelane_b32 v32, s31, 9
 ; GFX9-NEXT:    v_readfirstlane_b32 s51, v17
 ; GFX9-NEXT:    v_readfirstlane_b32 s50, v16
 ; GFX9-NEXT:    v_readfirstlane_b32 s49, v15
@@ -230153,16 +230153,16 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i
 ; GFX9-NEXT:    v_mov_b32_e32 v14, s30
 ; GFX9-NEXT:    v_mov_b32_e32 v15, s31
 ; GFX9-NEXT:  .LBB111_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s51, v32, 9
-; GFX9-NEXT:    v_readlane_b32 s50, v32, 8
-; GFX9-NEXT:    v_readlane_b32 s49, v32, 7
-; GFX9-NEXT:    v_readlane_b32 s48, v32, 6
-; GFX9-NEXT:    v_readlane_b32 s39, v32, 5
-; GFX9-NEXT:    v_readlane_b32 s38, v32, 4
-; GFX9-NEXT:    v_readlane_b32 s37, v32, 3
-; GFX9-NEXT:    v_readlane_b32 s36, v32, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v32, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v32, 0
+; GFX9-NEXT:    v_readlane_b32 s30, v32, 8
+; GFX9-NEXT:    v_readlane_b32 s31, v32, 9
+; GFX9-NEXT:    v_readlane_b32 s51, v32, 7
+; GFX9-NEXT:    v_readlane_b32 s50, v32, 6
+; GFX9-NEXT:    v_readlane_b32 s49, v32, 5
+; GFX9-NEXT:    v_readlane_b32 s48, v32, 4
+; GFX9-NEXT:    v_readlane_b32 s39, v32, 3
+; GFX9-NEXT:    v_readlane_b32 s38, v32, 2
+; GFX9-NEXT:    v_readlane_b32 s37, v32, 1
+; GFX9-NEXT:    v_readlane_b32 s36, v32, 0
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll
index 94f4be1c32418..efff1fb94ffa9 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll
@@ -6,7 +6,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
-define <4 x float> @bitcast_v4i32_to_v4f32(<4 x i32> %a, i32 %b) {
+define <4 x float> @bitcast_v4i32_to_v4f32(<4 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4i32_to_v4f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -88,7 +88,7 @@ end:
   ret <4 x float> %phi
 }
 
-define inreg <4 x float> @bitcast_v4i32_to_v4f32_scalar(<4 x i32> inreg %a, i32 inreg %b) {
+define inreg <4 x float> @bitcast_v4i32_to_v4f32_scalar(<4 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4i32_to_v4f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -190,7 +190,7 @@ end:
   ret <4 x float> %phi
 }
 
-define <4 x i32> @bitcast_v4f32_to_v4i32(<4 x float> %a, i32 %b) {
+define <4 x i32> @bitcast_v4f32_to_v4i32(<4 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4f32_to_v4i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -270,7 +270,7 @@ end:
   ret <4 x i32> %phi
 }
 
-define inreg <4 x i32> @bitcast_v4f32_to_v4i32_scalar(<4 x float> inreg %a, i32 inreg %b) {
+define inreg <4 x i32> @bitcast_v4f32_to_v4i32_scalar(<4 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4f32_to_v4i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -375,7 +375,7 @@ end:
   ret <4 x i32> %phi
 }
 
-define <2 x i64> @bitcast_v4i32_to_v2i64(<4 x i32> %a, i32 %b) {
+define <2 x i64> @bitcast_v4i32_to_v2i64(<4 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4i32_to_v2i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -457,7 +457,7 @@ end:
   ret <2 x i64> %phi
 }
 
-define inreg <2 x i64> @bitcast_v4i32_to_v2i64_scalar(<4 x i32> inreg %a, i32 inreg %b) {
+define inreg <2 x i64> @bitcast_v4i32_to_v2i64_scalar(<4 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4i32_to_v2i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -559,7 +559,7 @@ end:
   ret <2 x i64> %phi
 }
 
-define <4 x i32> @bitcast_v2i64_to_v4i32(<2 x i64> %a, i32 %b) {
+define <4 x i32> @bitcast_v2i64_to_v4i32(<2 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2i64_to_v4i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -642,7 +642,7 @@ end:
   ret <4 x i32> %phi
 }
 
-define inreg <4 x i32> @bitcast_v2i64_to_v4i32_scalar(<2 x i64> inreg %a, i32 inreg %b) {
+define inreg <4 x i32> @bitcast_v2i64_to_v4i32_scalar(<2 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2i64_to_v4i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -744,7 +744,7 @@ end:
   ret <4 x i32> %phi
 }
 
-define <2 x double> @bitcast_v4i32_to_v2f64(<4 x i32> %a, i32 %b) {
+define <2 x double> @bitcast_v4i32_to_v2f64(<4 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4i32_to_v2f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -826,7 +826,7 @@ end:
   ret <2 x double> %phi
 }
 
-define inreg <2 x double> @bitcast_v4i32_to_v2f64_scalar(<4 x i32> inreg %a, i32 inreg %b) {
+define inreg <2 x double> @bitcast_v4i32_to_v2f64_scalar(<4 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4i32_to_v2f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -928,7 +928,7 @@ end:
   ret <2 x double> %phi
 }
 
-define <4 x i32> @bitcast_v2f64_to_v4i32(<2 x double> %a, i32 %b) {
+define <4 x i32> @bitcast_v2f64_to_v4i32(<2 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2f64_to_v4i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1005,7 +1005,7 @@ end:
   ret <4 x i32> %phi
 }
 
-define inreg <4 x i32> @bitcast_v2f64_to_v4i32_scalar(<2 x double> inreg %a, i32 inreg %b) {
+define inreg <4 x i32> @bitcast_v2f64_to_v4i32_scalar(<2 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2f64_to_v4i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1102,7 +1102,7 @@ end:
   ret <4 x i32> %phi
 }
 
-define <8 x i16> @bitcast_v4i32_to_v8i16(<4 x i32> %a, i32 %b) {
+define <8 x i16> @bitcast_v4i32_to_v8i16(<4 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4i32_to_v8i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1210,7 +1210,7 @@ end:
   ret <8 x i16> %phi
 }
 
-define inreg <8 x i16> @bitcast_v4i32_to_v8i16_scalar(<4 x i32> inreg %a, i32 inreg %b) {
+define inreg <8 x i16> @bitcast_v4i32_to_v8i16_scalar(<4 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4i32_to_v8i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1336,7 +1336,7 @@ end:
   ret <8 x i16> %phi
 }
 
-define <4 x i32> @bitcast_v8i16_to_v4i32(<8 x i16> %a, i32 %b) {
+define <4 x i32> @bitcast_v8i16_to_v4i32(<8 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8i16_to_v4i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1477,7 +1477,7 @@ end:
   ret <4 x i32> %phi
 }
 
-define inreg <4 x i32> @bitcast_v8i16_to_v4i32_scalar(<8 x i16> inreg %a, i32 inreg %b) {
+define inreg <4 x i32> @bitcast_v8i16_to_v4i32_scalar(<8 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8i16_to_v4i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1629,7 +1629,7 @@ end:
   ret <4 x i32> %phi
 }
 
-define <8 x half> @bitcast_v4i32_to_v8f16(<4 x i32> %a, i32 %b) {
+define <8 x half> @bitcast_v4i32_to_v8f16(<4 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4i32_to_v8f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1737,7 +1737,7 @@ end:
   ret <8 x half> %phi
 }
 
-define inreg <8 x half> @bitcast_v4i32_to_v8f16_scalar(<4 x i32> inreg %a, i32 inreg %b) {
+define inreg <8 x half> @bitcast_v4i32_to_v8f16_scalar(<4 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4i32_to_v8f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1863,7 +1863,7 @@ end:
   ret <8 x half> %phi
 }
 
-define <4 x i32> @bitcast_v8f16_to_v4i32(<8 x half> %a, i32 %b) {
+define <4 x i32> @bitcast_v8f16_to_v4i32(<8 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8f16_to_v4i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2021,7 +2021,7 @@ end:
   ret <4 x i32> %phi
 }
 
-define inreg <4 x i32> @bitcast_v8f16_to_v4i32_scalar(<8 x half> inreg %a, i32 inreg %b) {
+define inreg <4 x i32> @bitcast_v8f16_to_v4i32_scalar(<8 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8f16_to_v4i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2189,7 +2189,7 @@ end:
   ret <4 x i32> %phi
 }
 
-define <8 x bfloat> @bitcast_v4i32_to_v8bf16(<4 x i32> %a, i32 %b) {
+define <8 x bfloat> @bitcast_v4i32_to_v8bf16(<4 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4i32_to_v8bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2318,7 +2318,7 @@ end:
   ret <8 x bfloat> %phi
 }
 
-define inreg <8 x bfloat> @bitcast_v4i32_to_v8bf16_scalar(<4 x i32> inreg %a, i32 inreg %b) {
+define inreg <8 x bfloat> @bitcast_v4i32_to_v8bf16_scalar(<4 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4i32_to_v8bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2456,7 +2456,7 @@ end:
   ret <8 x bfloat> %phi
 }
 
-define <4 x i32> @bitcast_v8bf16_to_v4i32(<8 x bfloat> %a, i32 %b) {
+define <4 x i32> @bitcast_v8bf16_to_v4i32(<8 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8bf16_to_v4i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2880,7 +2880,7 @@ end:
   ret <4 x i32> %phi
 }
 
-define inreg <4 x i32> @bitcast_v8bf16_to_v4i32_scalar(<8 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <4 x i32> @bitcast_v8bf16_to_v4i32_scalar(<8 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8bf16_to_v4i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3349,7 +3349,7 @@ end:
   ret <4 x i32> %phi
 }
 
-define <16 x i8> @bitcast_v4i32_to_v16i8(<4 x i32> %a, i32 %b) {
+define <16 x i8> @bitcast_v4i32_to_v16i8(<4 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4i32_to_v16i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3675,7 +3675,7 @@ end:
   ret <16 x i8> %phi
 }
 
-define inreg <16 x i8> @bitcast_v4i32_to_v16i8_scalar(<4 x i32> inreg %a, i32 inreg %b) {
+define inreg <16 x i8> @bitcast_v4i32_to_v16i8_scalar(<4 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4i32_to_v16i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3963,7 +3963,7 @@ end:
   ret <16 x i8> %phi
 }
 
-define <4 x i32> @bitcast_v16i8_to_v4i32(<16 x i8> %a, i32 %b) {
+define <4 x i32> @bitcast_v16i8_to_v4i32(<16 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16i8_to_v4i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4488,7 +4488,7 @@ end:
   ret <4 x i32> %phi
 }
 
-define inreg <4 x i32> @bitcast_v16i8_to_v4i32_scalar(<16 x i8> inreg %a, i32 inreg %b) {
+define inreg <4 x i32> @bitcast_v16i8_to_v4i32_scalar(<16 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16i8_to_v4i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4851,7 +4851,7 @@ end:
   ret <4 x i32> %phi
 }
 
-define <2 x i64> @bitcast_v4f32_to_v2i64(<4 x float> %a, i32 %b) {
+define <2 x i64> @bitcast_v4f32_to_v2i64(<4 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4f32_to_v2i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4931,7 +4931,7 @@ end:
   ret <2 x i64> %phi
 }
 
-define inreg <2 x i64> @bitcast_v4f32_to_v2i64_scalar(<4 x float> inreg %a, i32 inreg %b) {
+define inreg <2 x i64> @bitcast_v4f32_to_v2i64_scalar(<4 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4f32_to_v2i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5036,7 +5036,7 @@ end:
   ret <2 x i64> %phi
 }
 
-define <4 x float> @bitcast_v2i64_to_v4f32(<2 x i64> %a, i32 %b) {
+define <4 x float> @bitcast_v2i64_to_v4f32(<2 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2i64_to_v4f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5119,7 +5119,7 @@ end:
   ret <4 x float> %phi
 }
 
-define inreg <4 x float> @bitcast_v2i64_to_v4f32_scalar(<2 x i64> inreg %a, i32 inreg %b) {
+define inreg <4 x float> @bitcast_v2i64_to_v4f32_scalar(<2 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2i64_to_v4f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5221,7 +5221,7 @@ end:
   ret <4 x float> %phi
 }
 
-define <2 x double> @bitcast_v4f32_to_v2f64(<4 x float> %a, i32 %b) {
+define <2 x double> @bitcast_v4f32_to_v2f64(<4 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4f32_to_v2f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5301,7 +5301,7 @@ end:
   ret <2 x double> %phi
 }
 
-define inreg <2 x double> @bitcast_v4f32_to_v2f64_scalar(<4 x float> inreg %a, i32 inreg %b) {
+define inreg <2 x double> @bitcast_v4f32_to_v2f64_scalar(<4 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4f32_to_v2f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5406,7 +5406,7 @@ end:
   ret <2 x double> %phi
 }
 
-define <4 x float> @bitcast_v2f64_to_v4f32(<2 x double> %a, i32 %b) {
+define <4 x float> @bitcast_v2f64_to_v4f32(<2 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2f64_to_v4f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5483,7 +5483,7 @@ end:
   ret <4 x float> %phi
 }
 
-define inreg <4 x float> @bitcast_v2f64_to_v4f32_scalar(<2 x double> inreg %a, i32 inreg %b) {
+define inreg <4 x float> @bitcast_v2f64_to_v4f32_scalar(<2 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2f64_to_v4f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5580,7 +5580,7 @@ end:
   ret <4 x float> %phi
 }
 
-define <8 x i16> @bitcast_v4f32_to_v8i16(<4 x float> %a, i32 %b) {
+define <8 x i16> @bitcast_v4f32_to_v8i16(<4 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4f32_to_v8i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5686,7 +5686,7 @@ end:
   ret <8 x i16> %phi
 }
 
-define inreg <8 x i16> @bitcast_v4f32_to_v8i16_scalar(<4 x float> inreg %a, i32 inreg %b) {
+define inreg <8 x i16> @bitcast_v4f32_to_v8i16_scalar(<4 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4f32_to_v8i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5820,7 +5820,7 @@ end:
   ret <8 x i16> %phi
 }
 
-define <4 x float> @bitcast_v8i16_to_v4f32(<8 x i16> %a, i32 %b) {
+define <4 x float> @bitcast_v8i16_to_v4f32(<8 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8i16_to_v4f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5961,7 +5961,7 @@ end:
   ret <4 x float> %phi
 }
 
-define inreg <4 x float> @bitcast_v8i16_to_v4f32_scalar(<8 x i16> inreg %a, i32 inreg %b) {
+define inreg <4 x float> @bitcast_v8i16_to_v4f32_scalar(<8 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8i16_to_v4f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6113,7 +6113,7 @@ end:
   ret <4 x float> %phi
 }
 
-define <8 x half> @bitcast_v4f32_to_v8f16(<4 x float> %a, i32 %b) {
+define <8 x half> @bitcast_v4f32_to_v8f16(<4 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4f32_to_v8f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6219,7 +6219,7 @@ end:
   ret <8 x half> %phi
 }
 
-define inreg <8 x half> @bitcast_v4f32_to_v8f16_scalar(<4 x float> inreg %a, i32 inreg %b) {
+define inreg <8 x half> @bitcast_v4f32_to_v8f16_scalar(<4 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4f32_to_v8f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6353,7 +6353,7 @@ end:
   ret <8 x half> %phi
 }
 
-define <4 x float> @bitcast_v8f16_to_v4f32(<8 x half> %a, i32 %b) {
+define <4 x float> @bitcast_v8f16_to_v4f32(<8 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8f16_to_v4f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6511,7 +6511,7 @@ end:
   ret <4 x float> %phi
 }
 
-define inreg <4 x float> @bitcast_v8f16_to_v4f32_scalar(<8 x half> inreg %a, i32 inreg %b) {
+define inreg <4 x float> @bitcast_v8f16_to_v4f32_scalar(<8 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8f16_to_v4f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6679,7 +6679,7 @@ end:
   ret <4 x float> %phi
 }
 
-define <8 x bfloat> @bitcast_v4f32_to_v8bf16(<4 x float> %a, i32 %b) {
+define <8 x bfloat> @bitcast_v4f32_to_v8bf16(<4 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4f32_to_v8bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6806,7 +6806,7 @@ end:
   ret <8 x bfloat> %phi
 }
 
-define inreg <8 x bfloat> @bitcast_v4f32_to_v8bf16_scalar(<4 x float> inreg %a, i32 inreg %b) {
+define inreg <8 x bfloat> @bitcast_v4f32_to_v8bf16_scalar(<4 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4f32_to_v8bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6956,7 +6956,7 @@ end:
   ret <8 x bfloat> %phi
 }
 
-define <4 x float> @bitcast_v8bf16_to_v4f32(<8 x bfloat> %a, i32 %b) {
+define <4 x float> @bitcast_v8bf16_to_v4f32(<8 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8bf16_to_v4f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7380,7 +7380,7 @@ end:
   ret <4 x float> %phi
 }
 
-define inreg <4 x float> @bitcast_v8bf16_to_v4f32_scalar(<8 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <4 x float> @bitcast_v8bf16_to_v4f32_scalar(<8 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8bf16_to_v4f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7849,7 +7849,7 @@ end:
   ret <4 x float> %phi
 }
 
-define <16 x i8> @bitcast_v4f32_to_v16i8(<4 x float> %a, i32 %b) {
+define <16 x i8> @bitcast_v4f32_to_v16i8(<4 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4f32_to_v16i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8171,7 +8171,7 @@ end:
   ret <16 x i8> %phi
 }
 
-define inreg <16 x i8> @bitcast_v4f32_to_v16i8_scalar(<4 x float> inreg %a, i32 inreg %b) {
+define inreg <16 x i8> @bitcast_v4f32_to_v16i8_scalar(<4 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4f32_to_v16i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8488,7 +8488,7 @@ end:
   ret <16 x i8> %phi
 }
 
-define <4 x float> @bitcast_v16i8_to_v4f32(<16 x i8> %a, i32 %b) {
+define <4 x float> @bitcast_v16i8_to_v4f32(<16 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16i8_to_v4f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9013,7 +9013,7 @@ end:
   ret <4 x float> %phi
 }
 
-define inreg <4 x float> @bitcast_v16i8_to_v4f32_scalar(<16 x i8> inreg %a, i32 inreg %b) {
+define inreg <4 x float> @bitcast_v16i8_to_v4f32_scalar(<16 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16i8_to_v4f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9376,7 +9376,7 @@ end:
   ret <4 x float> %phi
 }
 
-define <2 x double> @bitcast_v2i64_to_v2f64(<2 x i64> %a, i32 %b) {
+define <2 x double> @bitcast_v2i64_to_v2f64(<2 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2i64_to_v2f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9459,7 +9459,7 @@ end:
   ret <2 x double> %phi
 }
 
-define inreg <2 x double> @bitcast_v2i64_to_v2f64_scalar(<2 x i64> inreg %a, i32 inreg %b) {
+define inreg <2 x double> @bitcast_v2i64_to_v2f64_scalar(<2 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2i64_to_v2f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9560,7 +9560,7 @@ end:
   ret <2 x double> %phi
 }
 
-define <2 x i64> @bitcast_v2f64_to_v2i64(<2 x double> %a, i32 %b) {
+define <2 x i64> @bitcast_v2f64_to_v2i64(<2 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2f64_to_v2i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9637,7 +9637,7 @@ end:
   ret <2 x i64> %phi
 }
 
-define inreg <2 x i64> @bitcast_v2f64_to_v2i64_scalar(<2 x double> inreg %a, i32 inreg %b) {
+define inreg <2 x i64> @bitcast_v2f64_to_v2i64_scalar(<2 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2f64_to_v2i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9734,7 +9734,7 @@ end:
   ret <2 x i64> %phi
 }
 
-define <8 x i16> @bitcast_v2i64_to_v8i16(<2 x i64> %a, i32 %b) {
+define <8 x i16> @bitcast_v2i64_to_v8i16(<2 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2i64_to_v8i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9843,7 +9843,7 @@ end:
   ret <8 x i16> %phi
 }
 
-define inreg <8 x i16> @bitcast_v2i64_to_v8i16_scalar(<2 x i64> inreg %a, i32 inreg %b) {
+define inreg <8 x i16> @bitcast_v2i64_to_v8i16_scalar(<2 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2i64_to_v8i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9969,7 +9969,7 @@ end:
   ret <8 x i16> %phi
 }
 
-define <2 x i64> @bitcast_v8i16_to_v2i64(<8 x i16> %a, i32 %b) {
+define <2 x i64> @bitcast_v8i16_to_v2i64(<8 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8i16_to_v2i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10110,7 +10110,7 @@ end:
   ret <2 x i64> %phi
 }
 
-define inreg <2 x i64> @bitcast_v8i16_to_v2i64_scalar(<8 x i16> inreg %a, i32 inreg %b) {
+define inreg <2 x i64> @bitcast_v8i16_to_v2i64_scalar(<8 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8i16_to_v2i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10262,7 +10262,7 @@ end:
   ret <2 x i64> %phi
 }
 
-define <8 x half> @bitcast_v2i64_to_v8f16(<2 x i64> %a, i32 %b) {
+define <8 x half> @bitcast_v2i64_to_v8f16(<2 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2i64_to_v8f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10371,7 +10371,7 @@ end:
   ret <8 x half> %phi
 }
 
-define inreg <8 x half> @bitcast_v2i64_to_v8f16_scalar(<2 x i64> inreg %a, i32 inreg %b) {
+define inreg <8 x half> @bitcast_v2i64_to_v8f16_scalar(<2 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2i64_to_v8f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10497,7 +10497,7 @@ end:
   ret <8 x half> %phi
 }
 
-define <2 x i64> @bitcast_v8f16_to_v2i64(<8 x half> %a, i32 %b) {
+define <2 x i64> @bitcast_v8f16_to_v2i64(<8 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8f16_to_v2i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10655,7 +10655,7 @@ end:
   ret <2 x i64> %phi
 }
 
-define inreg <2 x i64> @bitcast_v8f16_to_v2i64_scalar(<8 x half> inreg %a, i32 inreg %b) {
+define inreg <2 x i64> @bitcast_v8f16_to_v2i64_scalar(<8 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8f16_to_v2i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10823,7 +10823,7 @@ end:
   ret <2 x i64> %phi
 }
 
-define <8 x bfloat> @bitcast_v2i64_to_v8bf16(<2 x i64> %a, i32 %b) {
+define <8 x bfloat> @bitcast_v2i64_to_v8bf16(<2 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2i64_to_v8bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10953,7 +10953,7 @@ end:
   ret <8 x bfloat> %phi
 }
 
-define inreg <8 x bfloat> @bitcast_v2i64_to_v8bf16_scalar(<2 x i64> inreg %a, i32 inreg %b) {
+define inreg <8 x bfloat> @bitcast_v2i64_to_v8bf16_scalar(<2 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2i64_to_v8bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11091,7 +11091,7 @@ end:
   ret <8 x bfloat> %phi
 }
 
-define <2 x i64> @bitcast_v8bf16_to_v2i64(<8 x bfloat> %a, i32 %b) {
+define <2 x i64> @bitcast_v8bf16_to_v2i64(<8 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8bf16_to_v2i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11515,7 +11515,7 @@ end:
   ret <2 x i64> %phi
 }
 
-define inreg <2 x i64> @bitcast_v8bf16_to_v2i64_scalar(<8 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <2 x i64> @bitcast_v8bf16_to_v2i64_scalar(<8 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8bf16_to_v2i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11984,7 +11984,7 @@ end:
   ret <2 x i64> %phi
 }
 
-define <16 x i8> @bitcast_v2i64_to_v16i8(<2 x i64> %a, i32 %b) {
+define <16 x i8> @bitcast_v2i64_to_v16i8(<2 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2i64_to_v16i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12310,7 +12310,7 @@ end:
   ret <16 x i8> %phi
 }
 
-define inreg <16 x i8> @bitcast_v2i64_to_v16i8_scalar(<2 x i64> inreg %a, i32 inreg %b) {
+define inreg <16 x i8> @bitcast_v2i64_to_v16i8_scalar(<2 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2i64_to_v16i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12598,7 +12598,7 @@ end:
   ret <16 x i8> %phi
 }
 
-define <2 x i64> @bitcast_v16i8_to_v2i64(<16 x i8> %a, i32 %b) {
+define <2 x i64> @bitcast_v16i8_to_v2i64(<16 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16i8_to_v2i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13123,7 +13123,7 @@ end:
   ret <2 x i64> %phi
 }
 
-define inreg <2 x i64> @bitcast_v16i8_to_v2i64_scalar(<16 x i8> inreg %a, i32 inreg %b) {
+define inreg <2 x i64> @bitcast_v16i8_to_v2i64_scalar(<16 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16i8_to_v2i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13486,7 +13486,7 @@ end:
   ret <2 x i64> %phi
 }
 
-define <8 x i16> @bitcast_v2f64_to_v8i16(<2 x double> %a, i32 %b) {
+define <8 x i16> @bitcast_v2f64_to_v8i16(<2 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2f64_to_v8i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13589,7 +13589,7 @@ end:
   ret <8 x i16> %phi
 }
 
-define inreg <8 x i16> @bitcast_v2f64_to_v8i16_scalar(<2 x double> inreg %a, i32 inreg %b) {
+define inreg <8 x i16> @bitcast_v2f64_to_v8i16_scalar(<2 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2f64_to_v8i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13715,7 +13715,7 @@ end:
   ret <8 x i16> %phi
 }
 
-define <2 x double> @bitcast_v8i16_to_v2f64(<8 x i16> %a, i32 %b) {
+define <2 x double> @bitcast_v8i16_to_v2f64(<8 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8i16_to_v2f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13856,7 +13856,7 @@ end:
   ret <2 x double> %phi
 }
 
-define inreg <2 x double> @bitcast_v8i16_to_v2f64_scalar(<8 x i16> inreg %a, i32 inreg %b) {
+define inreg <2 x double> @bitcast_v8i16_to_v2f64_scalar(<8 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8i16_to_v2f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14008,7 +14008,7 @@ end:
   ret <2 x double> %phi
 }
 
-define <8 x half> @bitcast_v2f64_to_v8f16(<2 x double> %a, i32 %b) {
+define <8 x half> @bitcast_v2f64_to_v8f16(<2 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2f64_to_v8f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14111,7 +14111,7 @@ end:
   ret <8 x half> %phi
 }
 
-define inreg <8 x half> @bitcast_v2f64_to_v8f16_scalar(<2 x double> inreg %a, i32 inreg %b) {
+define inreg <8 x half> @bitcast_v2f64_to_v8f16_scalar(<2 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2f64_to_v8f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14237,7 +14237,7 @@ end:
   ret <8 x half> %phi
 }
 
-define <2 x double> @bitcast_v8f16_to_v2f64(<8 x half> %a, i32 %b) {
+define <2 x double> @bitcast_v8f16_to_v2f64(<8 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8f16_to_v2f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14395,7 +14395,7 @@ end:
   ret <2 x double> %phi
 }
 
-define inreg <2 x double> @bitcast_v8f16_to_v2f64_scalar(<8 x half> inreg %a, i32 inreg %b) {
+define inreg <2 x double> @bitcast_v8f16_to_v2f64_scalar(<8 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8f16_to_v2f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14563,7 +14563,7 @@ end:
   ret <2 x double> %phi
 }
 
-define <8 x bfloat> @bitcast_v2f64_to_v8bf16(<2 x double> %a, i32 %b) {
+define <8 x bfloat> @bitcast_v2f64_to_v8bf16(<2 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2f64_to_v8bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14685,7 +14685,7 @@ end:
   ret <8 x bfloat> %phi
 }
 
-define inreg <8 x bfloat> @bitcast_v2f64_to_v8bf16_scalar(<2 x double> inreg %a, i32 inreg %b) {
+define inreg <8 x bfloat> @bitcast_v2f64_to_v8bf16_scalar(<2 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2f64_to_v8bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14827,7 +14827,7 @@ end:
   ret <8 x bfloat> %phi
 }
 
-define <2 x double> @bitcast_v8bf16_to_v2f64(<8 x bfloat> %a, i32 %b) {
+define <2 x double> @bitcast_v8bf16_to_v2f64(<8 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8bf16_to_v2f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15251,7 +15251,7 @@ end:
   ret <2 x double> %phi
 }
 
-define inreg <2 x double> @bitcast_v8bf16_to_v2f64_scalar(<8 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <2 x double> @bitcast_v8bf16_to_v2f64_scalar(<8 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8bf16_to_v2f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15720,7 +15720,7 @@ end:
   ret <2 x double> %phi
 }
 
-define <16 x i8> @bitcast_v2f64_to_v16i8(<2 x double> %a, i32 %b) {
+define <16 x i8> @bitcast_v2f64_to_v16i8(<2 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2f64_to_v16i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16038,7 +16038,7 @@ end:
   ret <16 x i8> %phi
 }
 
-define inreg <16 x i8> @bitcast_v2f64_to_v16i8_scalar(<2 x double> inreg %a, i32 inreg %b) {
+define inreg <16 x i8> @bitcast_v2f64_to_v16i8_scalar(<2 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2f64_to_v16i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16346,7 +16346,7 @@ end:
   ret <16 x i8> %phi
 }
 
-define <2 x double> @bitcast_v16i8_to_v2f64(<16 x i8> %a, i32 %b) {
+define <2 x double> @bitcast_v16i8_to_v2f64(<16 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16i8_to_v2f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16871,7 +16871,7 @@ end:
   ret <2 x double> %phi
 }
 
-define inreg <2 x double> @bitcast_v16i8_to_v2f64_scalar(<16 x i8> inreg %a, i32 inreg %b) {
+define inreg <2 x double> @bitcast_v16i8_to_v2f64_scalar(<16 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16i8_to_v2f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17234,7 +17234,7 @@ end:
   ret <2 x double> %phi
 }
 
-define <8 x half> @bitcast_v8i16_to_v8f16(<8 x i16> %a, i32 %b) {
+define <8 x half> @bitcast_v8i16_to_v8f16(<8 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8i16_to_v8f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17391,7 +17391,7 @@ end:
   ret <8 x half> %phi
 }
 
-define inreg <8 x half> @bitcast_v8i16_to_v8f16_scalar(<8 x i16> inreg %a, i32 inreg %b) {
+define inreg <8 x half> @bitcast_v8i16_to_v8f16_scalar(<8 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8i16_to_v8f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17566,7 +17566,7 @@ end:
   ret <8 x half> %phi
 }
 
-define <8 x i16> @bitcast_v8f16_to_v8i16(<8 x half> %a, i32 %b) {
+define <8 x i16> @bitcast_v8f16_to_v8i16(<8 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8f16_to_v8i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17706,7 +17706,7 @@ end:
   ret <8 x i16> %phi
 }
 
-define inreg <8 x i16> @bitcast_v8f16_to_v8i16_scalar(<8 x half> inreg %a, i32 inreg %b) {
+define inreg <8 x i16> @bitcast_v8f16_to_v8i16_scalar(<8 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8f16_to_v8i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17880,7 +17880,7 @@ end:
   ret <8 x i16> %phi
 }
 
-define <8 x bfloat> @bitcast_v8i16_to_v8bf16(<8 x i16> %a, i32 %b) {
+define <8 x bfloat> @bitcast_v8i16_to_v8bf16(<8 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8i16_to_v8bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18032,7 +18032,7 @@ end:
   ret <8 x bfloat> %phi
 }
 
-define inreg <8 x bfloat> @bitcast_v8i16_to_v8bf16_scalar(<8 x i16> inreg %a, i32 inreg %b) {
+define inreg <8 x bfloat> @bitcast_v8i16_to_v8bf16_scalar(<8 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8i16_to_v8bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18207,7 +18207,7 @@ end:
   ret <8 x bfloat> %phi
 }
 
-define <8 x i16> @bitcast_v8bf16_to_v8i16(<8 x bfloat> %a, i32 %b) {
+define <8 x i16> @bitcast_v8bf16_to_v8i16(<8 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8bf16_to_v8i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18641,7 +18641,7 @@ end:
   ret <8 x i16> %phi
 }
 
-define inreg <8 x i16> @bitcast_v8bf16_to_v8i16_scalar(<8 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <8 x i16> @bitcast_v8bf16_to_v8i16_scalar(<8 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8bf16_to_v8i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19114,7 +19114,7 @@ end:
   ret <8 x i16> %phi
 }
 
-define <16 x i8> @bitcast_v8i16_to_v16i8(<8 x i16> %a, i32 %b) {
+define <16 x i8> @bitcast_v8i16_to_v16i8(<8 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8i16_to_v16i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19494,7 +19494,7 @@ end:
   ret <16 x i8> %phi
 }
 
-define inreg <16 x i8> @bitcast_v8i16_to_v16i8_scalar(<8 x i16> inreg %a, i32 inreg %b) {
+define inreg <16 x i8> @bitcast_v8i16_to_v16i8_scalar(<8 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8i16_to_v16i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19843,7 +19843,7 @@ end:
   ret <16 x i8> %phi
 }
 
-define <8 x i16> @bitcast_v16i8_to_v8i16(<16 x i8> %a, i32 %b) {
+define <8 x i16> @bitcast_v16i8_to_v8i16(<16 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16i8_to_v8i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20389,7 +20389,7 @@ end:
   ret <8 x i16> %phi
 }
 
-define inreg <8 x i16> @bitcast_v16i8_to_v8i16_scalar(<16 x i8> inreg %a, i32 inreg %b) {
+define inreg <8 x i16> @bitcast_v16i8_to_v8i16_scalar(<16 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16i8_to_v8i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20779,7 +20779,7 @@ end:
   ret <8 x i16> %phi
 }
 
-define <8 x bfloat> @bitcast_v8f16_to_v8bf16(<8 x half> %a, i32 %b) {
+define <8 x bfloat> @bitcast_v8f16_to_v8bf16(<8 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8f16_to_v8bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20947,7 +20947,7 @@ end:
   ret <8 x bfloat> %phi
 }
 
-define inreg <8 x bfloat> @bitcast_v8f16_to_v8bf16_scalar(<8 x half> inreg %a, i32 inreg %b) {
+define inreg <8 x bfloat> @bitcast_v8f16_to_v8bf16_scalar(<8 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8f16_to_v8bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -21139,7 +21139,7 @@ end:
   ret <8 x bfloat> %phi
 }
 
-define <8 x half> @bitcast_v8bf16_to_v8f16(<8 x bfloat> %a, i32 %b) {
+define <8 x half> @bitcast_v8bf16_to_v8f16(<8 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8bf16_to_v8f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -21586,7 +21586,7 @@ end:
   ret <8 x half> %phi
 }
 
-define inreg <8 x half> @bitcast_v8bf16_to_v8f16_scalar(<8 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <8 x half> @bitcast_v8bf16_to_v8f16_scalar(<8 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8bf16_to_v8f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22084,7 +22084,7 @@ end:
   ret <8 x half> %phi
 }
 
-define <16 x i8> @bitcast_v8f16_to_v16i8(<8 x half> %a, i32 %b) {
+define <16 x i8> @bitcast_v8f16_to_v16i8(<8 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8f16_to_v16i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22468,7 +22468,7 @@ end:
   ret <16 x i8> %phi
 }
 
-define inreg <16 x i8> @bitcast_v8f16_to_v16i8_scalar(<8 x half> inreg %a, i32 inreg %b) {
+define inreg <16 x i8> @bitcast_v8f16_to_v16i8_scalar(<8 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8f16_to_v16i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22837,7 +22837,7 @@ end:
   ret <16 x i8> %phi
 }
 
-define <8 x half> @bitcast_v16i8_to_v8f16(<16 x i8> %a, i32 %b) {
+define <8 x half> @bitcast_v16i8_to_v8f16(<16 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16i8_to_v8f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -23383,7 +23383,7 @@ end:
   ret <8 x half> %phi
 }
 
-define inreg <8 x half> @bitcast_v16i8_to_v8f16_scalar(<16 x i8> inreg %a, i32 inreg %b) {
+define inreg <8 x half> @bitcast_v16i8_to_v8f16_scalar(<16 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16i8_to_v8f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -23773,7 +23773,7 @@ end:
   ret <8 x half> %phi
 }
 
-define <16 x i8> @bitcast_v8bf16_to_v16i8(<8 x bfloat> %a, i32 %b) {
+define <16 x i8> @bitcast_v8bf16_to_v16i8(<8 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8bf16_to_v16i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -24432,7 +24432,7 @@ end:
   ret <16 x i8> %phi
 }
 
-define inreg <16 x i8> @bitcast_v8bf16_to_v16i8_scalar(<8 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <16 x i8> @bitcast_v8bf16_to_v16i8_scalar(<8 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8bf16_to_v16i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -25138,7 +25138,7 @@ end:
   ret <16 x i8> %phi
 }
 
-define <8 x bfloat> @bitcast_v16i8_to_v8bf16(<16 x i8> %a, i32 %b) {
+define <8 x bfloat> @bitcast_v16i8_to_v8bf16(<16 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16i8_to_v8bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -25686,7 +25686,7 @@ end:
   ret <8 x bfloat> %phi
 }
 
-define inreg <8 x bfloat> @bitcast_v16i8_to_v8bf16_scalar(<16 x i8> inreg %a, i32 inreg %b) {
+define inreg <8 x bfloat> @bitcast_v16i8_to_v8bf16_scalar(<16 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16i8_to_v8bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -26071,3 +26071,5 @@ end:
   %phi = phi <8 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
   ret <8 x bfloat> %phi
 }
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.160bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.160bit.ll
index 430a93d9e9bf0..c09389ef700ac 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.160bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.160bit.ll
@@ -6,7 +6,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
-define <5 x float> @bitcast_v5i32_to_v5f32(<5 x i32> %a, i32 %b) {
+define <5 x float> @bitcast_v5i32_to_v5f32(<5 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v5i32_to_v5f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -92,7 +92,7 @@ end:
   ret <5 x float> %phi
 }
 
-define inreg <5 x float> @bitcast_v5i32_to_v5f32_scalar(<5 x i32> inreg %a, i32 inreg %b) {
+define inreg <5 x float> @bitcast_v5i32_to_v5f32_scalar(<5 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v5i32_to_v5f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -202,7 +202,7 @@ end:
   ret <5 x float> %phi
 }
 
-define <5 x i32> @bitcast_v5f32_to_v5i32(<5 x float> %a, i32 %b) {
+define <5 x i32> @bitcast_v5f32_to_v5i32(<5 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v5f32_to_v5i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -286,7 +286,7 @@ end:
   ret <5 x i32> %phi
 }
 
-define inreg <5 x i32> @bitcast_v5f32_to_v5i32_scalar(<5 x float> inreg %a, i32 inreg %b) {
+define inreg <5 x i32> @bitcast_v5f32_to_v5i32_scalar(<5 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v5f32_to_v5i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -400,7 +400,7 @@ end:
   ret <5 x i32> %phi
 }
 
-define <10 x i16> @bitcast_v5i32_to_v10i16(<5 x i32> %a, i32 %b) {
+define <10 x i16> @bitcast_v5i32_to_v10i16(<5 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v5i32_to_v10i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -519,7 +519,7 @@ end:
   ret <10 x i16> %phi
 }
 
-define inreg <10 x i16> @bitcast_v5i32_to_v10i16_scalar(<5 x i32> inreg %a, i32 inreg %b) {
+define inreg <10 x i16> @bitcast_v5i32_to_v10i16_scalar(<5 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v5i32_to_v10i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -659,7 +659,7 @@ end:
   ret <10 x i16> %phi
 }
 
-define <5 x i32> @bitcast_v10i16_to_v5i32(<10 x i16> %a, i32 %b) {
+define <5 x i32> @bitcast_v10i16_to_v5i32(<10 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v10i16_to_v5i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -817,7 +817,7 @@ end:
   ret <5 x i32> %phi
 }
 
-define inreg <5 x i32> @bitcast_v10i16_to_v5i32_scalar(<10 x i16> inreg %a, i32 inreg %b) {
+define inreg <5 x i32> @bitcast_v10i16_to_v5i32_scalar(<10 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v10i16_to_v5i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -990,7 +990,7 @@ end:
   ret <5 x i32> %phi
 }
 
-define <10 x half> @bitcast_v5i32_to_v10f16(<5 x i32> %a, i32 %b) {
+define <10 x half> @bitcast_v5i32_to_v10f16(<5 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v5i32_to_v10f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1109,7 +1109,7 @@ end:
   ret <10 x half> %phi
 }
 
-define inreg <10 x half> @bitcast_v5i32_to_v10f16_scalar(<5 x i32> inreg %a, i32 inreg %b) {
+define inreg <10 x half> @bitcast_v5i32_to_v10f16_scalar(<5 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v5i32_to_v10f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1249,7 +1249,7 @@ end:
   ret <10 x half> %phi
 }
 
-define <5 x i32> @bitcast_v10f16_to_v5i32(<10 x half> %a, i32 %b) {
+define <5 x i32> @bitcast_v10f16_to_v5i32(<10 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v10f16_to_v5i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1427,7 +1427,7 @@ end:
   ret <5 x i32> %phi
 }
 
-define inreg <5 x i32> @bitcast_v10f16_to_v5i32_scalar(<10 x half> inreg %a, i32 inreg %b) {
+define inreg <5 x i32> @bitcast_v10f16_to_v5i32_scalar(<10 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v10f16_to_v5i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1619,7 +1619,7 @@ end:
   ret <5 x i32> %phi
 }
 
-define <10 x i16> @bitcast_v5f32_to_v10i16(<5 x float> %a, i32 %b) {
+define <10 x i16> @bitcast_v5f32_to_v10i16(<5 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v5f32_to_v10i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1736,7 +1736,7 @@ end:
   ret <10 x i16> %phi
 }
 
-define inreg <10 x i16> @bitcast_v5f32_to_v10i16_scalar(<5 x float> inreg %a, i32 inreg %b) {
+define inreg <10 x i16> @bitcast_v5f32_to_v10i16_scalar(<5 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v5f32_to_v10i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1893,7 +1893,7 @@ end:
   ret <10 x i16> %phi
 }
 
-define <5 x float> @bitcast_v10i16_to_v5f32(<10 x i16> %a, i32 %b) {
+define <5 x float> @bitcast_v10i16_to_v5f32(<10 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v10i16_to_v5f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2051,7 +2051,7 @@ end:
   ret <5 x float> %phi
 }
 
-define inreg <5 x float> @bitcast_v10i16_to_v5f32_scalar(<10 x i16> inreg %a, i32 inreg %b) {
+define inreg <5 x float> @bitcast_v10i16_to_v5f32_scalar(<10 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v10i16_to_v5f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2224,7 +2224,7 @@ end:
   ret <5 x float> %phi
 }
 
-define <10 x half> @bitcast_v5f32_to_v10f16(<5 x float> %a, i32 %b) {
+define <10 x half> @bitcast_v5f32_to_v10f16(<5 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v5f32_to_v10f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2341,7 +2341,7 @@ end:
   ret <10 x half> %phi
 }
 
-define inreg <10 x half> @bitcast_v5f32_to_v10f16_scalar(<5 x float> inreg %a, i32 inreg %b) {
+define inreg <10 x half> @bitcast_v5f32_to_v10f16_scalar(<5 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v5f32_to_v10f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2498,7 +2498,7 @@ end:
   ret <10 x half> %phi
 }
 
-define <5 x float> @bitcast_v10f16_to_v5f32(<10 x half> %a, i32 %b) {
+define <5 x float> @bitcast_v10f16_to_v5f32(<10 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v10f16_to_v5f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2676,7 +2676,7 @@ end:
   ret <5 x float> %phi
 }
 
-define inreg <5 x float> @bitcast_v10f16_to_v5f32_scalar(<10 x half> inreg %a, i32 inreg %b) {
+define inreg <5 x float> @bitcast_v10f16_to_v5f32_scalar(<10 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v10f16_to_v5f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2868,7 +2868,7 @@ end:
   ret <5 x float> %phi
 }
 
-define <10 x half> @bitcast_v10i16_to_v10f16(<10 x i16> %a, i32 %b) {
+define <10 x half> @bitcast_v10i16_to_v10f16(<10 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v10i16_to_v10f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3045,7 +3045,7 @@ end:
   ret <10 x half> %phi
 }
 
-define inreg <10 x half> @bitcast_v10i16_to_v10f16_scalar(<10 x i16> inreg %a, i32 inreg %b) {
+define inreg <10 x half> @bitcast_v10i16_to_v10f16_scalar(<10 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v10i16_to_v10f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3250,7 +3250,7 @@ end:
   ret <10 x half> %phi
 }
 
-define <10 x i16> @bitcast_v10f16_to_v10i16(<10 x half> %a, i32 %b) {
+define <10 x i16> @bitcast_v10f16_to_v10i16(<10 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v10f16_to_v10i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3407,7 +3407,7 @@ end:
   ret <10 x i16> %phi
 }
 
-define inreg <10 x i16> @bitcast_v10f16_to_v10i16_scalar(<10 x half> inreg %a, i32 inreg %b) {
+define inreg <10 x i16> @bitcast_v10f16_to_v10i16_scalar(<10 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v10f16_to_v10i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3612,6 +3612,9 @@ end:
   %phi = phi <10 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
   ret <10 x i16> %phi
 }
+
+attributes #0 = { nounwind }
+
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; GFX11-FAKE16: {{.*}}
 ; GFX11-TRUE16: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll
index b6b321a08f7aa..73d5bfa812b08 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll
@@ -6,7 +6,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
-define half @bitcast_i16_to_f16(i16 %a, i32 %b) {
+define half @bitcast_i16_to_f16(i16 %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_i16_to_f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -99,7 +99,7 @@ end:
   ret half %phi
 }
 
-define inreg half @bitcast_i16_to_f16_scalar(i16 inreg %a, i32 inreg %b) {
+define inreg half @bitcast_i16_to_f16_scalar(i16 inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_i16_to_f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -181,7 +181,7 @@ end:
   ret half %phi
 }
 
-define i16 @bitcast_f16_to_i16(half %a, i32 %b) {
+define i16 @bitcast_f16_to_i16(half %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_f16_to_i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -287,7 +287,7 @@ end:
   ret i16 %phi
 }
 
-define inreg i16 @bitcast_f16_to_i16_scalar(half inreg %a, i32 inreg %b) {
+define inreg i16 @bitcast_f16_to_i16_scalar(half inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_f16_to_i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -394,7 +394,7 @@ end:
   ret i16 %phi
 }
 
-define bfloat @bitcast_i16_to_bf16(i16 %a, i32 %b) {
+define bfloat @bitcast_i16_to_bf16(i16 %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_i16_to_bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -490,7 +490,7 @@ end:
   ret bfloat %phi
 }
 
-define inreg bfloat @bitcast_i16_to_bf16_scalar(i16 inreg %a, i32 inreg %b) {
+define inreg bfloat @bitcast_i16_to_bf16_scalar(i16 inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_i16_to_bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -576,7 +576,7 @@ end:
   ret bfloat %phi
 }
 
-define i16 @bitcast_bf16_to_i16(bfloat %a, i32 %b) {
+define i16 @bitcast_bf16_to_i16(bfloat %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_bf16_to_i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -720,7 +720,7 @@ end:
   ret i16 %phi
 }
 
-define inreg i16 @bitcast_bf16_to_i16_scalar(bfloat inreg %a, i32 inreg %b) {
+define inreg i16 @bitcast_bf16_to_i16_scalar(bfloat inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_bf16_to_i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -836,7 +836,7 @@ end:
   ret i16 %phi
 }
 
-define bfloat @bitcast_f16_to_bf16(half %a, i32 %b) {
+define bfloat @bitcast_f16_to_bf16(half %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_f16_to_bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -939,7 +939,7 @@ end:
   ret bfloat %phi
 }
 
-define inreg bfloat @bitcast_f16_to_bf16_scalar(half inreg %a, i32 inreg %b) {
+define inreg bfloat @bitcast_f16_to_bf16_scalar(half inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_f16_to_bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1050,7 +1050,7 @@ end:
   ret bfloat %phi
 }
 
-define half @bitcast_bf16_to_f16(bfloat %a, i32 %b) {
+define half @bitcast_bf16_to_f16(bfloat %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_bf16_to_f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1194,7 +1194,7 @@ end:
   ret half %phi
 }
 
-define inreg half @bitcast_bf16_to_f16_scalar(bfloat inreg %a, i32 inreg %b) {
+define inreg half @bitcast_bf16_to_f16_scalar(bfloat inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_bf16_to_f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1309,3 +1309,5 @@ end:
   %phi = phi half [ %a2, %cmp.true ], [ %a3, %cmp.false ]
   ret half %phi
 }
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.192bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.192bit.ll
index d463b115d1088..647b212d4d0bf 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.192bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.192bit.ll
@@ -6,7 +6,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
-define <6 x float> @bitcast_v6i32_to_v6f32(<6 x i32> %a, i32 %b) {
+define <6 x float> @bitcast_v6i32_to_v6f32(<6 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v6i32_to_v6f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -96,7 +96,7 @@ end:
   ret <6 x float> %phi
 }
 
-define inreg <6 x float> @bitcast_v6i32_to_v6f32_scalar(<6 x i32> inreg %a, i32 inreg %b) {
+define inreg <6 x float> @bitcast_v6i32_to_v6f32_scalar(<6 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v6i32_to_v6f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -213,7 +213,7 @@ end:
   ret <6 x float> %phi
 }
 
-define <6 x i32> @bitcast_v6f32_to_v6i32(<6 x float> %a, i32 %b) {
+define <6 x i32> @bitcast_v6f32_to_v6i32(<6 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v6f32_to_v6i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -300,7 +300,7 @@ end:
   ret <6 x i32> %phi
 }
 
-define inreg <6 x i32> @bitcast_v6f32_to_v6i32_scalar(<6 x float> inreg %a, i32 inreg %b) {
+define inreg <6 x i32> @bitcast_v6f32_to_v6i32_scalar(<6 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v6f32_to_v6i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -422,7 +422,7 @@ end:
   ret <6 x i32> %phi
 }
 
-define <3 x i64> @bitcast_v6i32_to_v3i64(<6 x i32> %a, i32 %b) {
+define <3 x i64> @bitcast_v6i32_to_v3i64(<6 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v6i32_to_v3i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -512,7 +512,7 @@ end:
   ret <3 x i64> %phi
 }
 
-define inreg <3 x i64> @bitcast_v6i32_to_v3i64_scalar(<6 x i32> inreg %a, i32 inreg %b) {
+define inreg <3 x i64> @bitcast_v6i32_to_v3i64_scalar(<6 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v6i32_to_v3i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -629,7 +629,7 @@ end:
   ret <3 x i64> %phi
 }
 
-define <6 x i32> @bitcast_v3i64_to_v6i32(<3 x i64> %a, i32 %b) {
+define <6 x i32> @bitcast_v3i64_to_v6i32(<3 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v3i64_to_v6i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -721,7 +721,7 @@ end:
   ret <6 x i32> %phi
 }
 
-define inreg <6 x i32> @bitcast_v3i64_to_v6i32_scalar(<3 x i64> inreg %a, i32 inreg %b) {
+define inreg <6 x i32> @bitcast_v3i64_to_v6i32_scalar(<3 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v3i64_to_v6i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -838,7 +838,7 @@ end:
   ret <6 x i32> %phi
 }
 
-define <3 x double> @bitcast_v6i32_to_v3f64(<6 x i32> %a, i32 %b) {
+define <3 x double> @bitcast_v6i32_to_v3f64(<6 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v6i32_to_v3f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -928,7 +928,7 @@ end:
   ret <3 x double> %phi
 }
 
-define inreg <3 x double> @bitcast_v6i32_to_v3f64_scalar(<6 x i32> inreg %a, i32 inreg %b) {
+define inreg <3 x double> @bitcast_v6i32_to_v3f64_scalar(<6 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v6i32_to_v3f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1045,7 +1045,7 @@ end:
   ret <3 x double> %phi
 }
 
-define <6 x i32> @bitcast_v3f64_to_v6i32(<3 x double> %a, i32 %b) {
+define <6 x i32> @bitcast_v3f64_to_v6i32(<3 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v3f64_to_v6i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1126,7 +1126,7 @@ end:
   ret <6 x i32> %phi
 }
 
-define inreg <6 x i32> @bitcast_v3f64_to_v6i32_scalar(<3 x double> inreg %a, i32 inreg %b) {
+define inreg <6 x i32> @bitcast_v3f64_to_v6i32_scalar(<3 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v3f64_to_v6i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1236,7 +1236,7 @@ end:
   ret <6 x i32> %phi
 }
 
-define <12 x i16> @bitcast_v6i32_to_v12i16(<6 x i32> %a, i32 %b) {
+define <12 x i16> @bitcast_v6i32_to_v12i16(<6 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v6i32_to_v12i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1365,7 +1365,7 @@ end:
   ret <12 x i16> %phi
 }
 
-define inreg <12 x i16> @bitcast_v6i32_to_v12i16_scalar(<6 x i32> inreg %a, i32 inreg %b) {
+define inreg <12 x i16> @bitcast_v6i32_to_v12i16_scalar(<6 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v6i32_to_v12i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1518,7 +1518,7 @@ end:
   ret <12 x i16> %phi
 }
 
-define <6 x i32> @bitcast_v12i16_to_v6i32(<12 x i16> %a, i32 %b) {
+define <6 x i32> @bitcast_v12i16_to_v6i32(<12 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v12i16_to_v6i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1692,7 +1692,7 @@ end:
   ret <6 x i32> %phi
 }
 
-define inreg <6 x i32> @bitcast_v12i16_to_v6i32_scalar(<12 x i16> inreg %a, i32 inreg %b) {
+define inreg <6 x i32> @bitcast_v12i16_to_v6i32_scalar(<12 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v12i16_to_v6i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1885,7 +1885,7 @@ end:
   ret <6 x i32> %phi
 }
 
-define <12 x half> @bitcast_v6i32_to_v12f16(<6 x i32> %a, i32 %b) {
+define <12 x half> @bitcast_v6i32_to_v12f16(<6 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v6i32_to_v12f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2014,7 +2014,7 @@ end:
   ret <12 x half> %phi
 }
 
-define inreg <12 x half> @bitcast_v6i32_to_v12f16_scalar(<6 x i32> inreg %a, i32 inreg %b) {
+define inreg <12 x half> @bitcast_v6i32_to_v12f16_scalar(<6 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v6i32_to_v12f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2167,7 +2167,7 @@ end:
   ret <12 x half> %phi
 }
 
-define <6 x i32> @bitcast_v12f16_to_v6i32(<12 x half> %a, i32 %b) {
+define <6 x i32> @bitcast_v12f16_to_v6i32(<12 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v12f16_to_v6i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2365,7 +2365,7 @@ end:
   ret <6 x i32> %phi
 }
 
-define inreg <6 x i32> @bitcast_v12f16_to_v6i32_scalar(<12 x half> inreg %a, i32 inreg %b) {
+define inreg <6 x i32> @bitcast_v12f16_to_v6i32_scalar(<12 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v12f16_to_v6i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2580,7 +2580,7 @@ end:
   ret <6 x i32> %phi
 }
 
-define <3 x i64> @bitcast_v6f32_to_v3i64(<6 x float> %a, i32 %b) {
+define <3 x i64> @bitcast_v6f32_to_v3i64(<6 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v6f32_to_v3i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2667,7 +2667,7 @@ end:
   ret <3 x i64> %phi
 }
 
-define inreg <3 x i64> @bitcast_v6f32_to_v3i64_scalar(<6 x float> inreg %a, i32 inreg %b) {
+define inreg <3 x i64> @bitcast_v6f32_to_v3i64_scalar(<6 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v6f32_to_v3i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2789,7 +2789,7 @@ end:
   ret <3 x i64> %phi
 }
 
-define <6 x float> @bitcast_v3i64_to_v6f32(<3 x i64> %a, i32 %b) {
+define <6 x float> @bitcast_v3i64_to_v6f32(<3 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v3i64_to_v6f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2881,7 +2881,7 @@ end:
   ret <6 x float> %phi
 }
 
-define inreg <6 x float> @bitcast_v3i64_to_v6f32_scalar(<3 x i64> inreg %a, i32 inreg %b) {
+define inreg <6 x float> @bitcast_v3i64_to_v6f32_scalar(<3 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v3i64_to_v6f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2998,7 +2998,7 @@ end:
   ret <6 x float> %phi
 }
 
-define <3 x double> @bitcast_v6f32_to_v3f64(<6 x float> %a, i32 %b) {
+define <3 x double> @bitcast_v6f32_to_v3f64(<6 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v6f32_to_v3f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3085,7 +3085,7 @@ end:
   ret <3 x double> %phi
 }
 
-define inreg <3 x double> @bitcast_v6f32_to_v3f64_scalar(<6 x float> inreg %a, i32 inreg %b) {
+define inreg <3 x double> @bitcast_v6f32_to_v3f64_scalar(<6 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v6f32_to_v3f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3207,7 +3207,7 @@ end:
   ret <3 x double> %phi
 }
 
-define <6 x float> @bitcast_v3f64_to_v6f32(<3 x double> %a, i32 %b) {
+define <6 x float> @bitcast_v3f64_to_v6f32(<3 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v3f64_to_v6f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3288,7 +3288,7 @@ end:
   ret <6 x float> %phi
 }
 
-define inreg <6 x float> @bitcast_v3f64_to_v6f32_scalar(<3 x double> inreg %a, i32 inreg %b) {
+define inreg <6 x float> @bitcast_v3f64_to_v6f32_scalar(<3 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v3f64_to_v6f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3398,7 +3398,7 @@ end:
   ret <6 x float> %phi
 }
 
-define <12 x i16> @bitcast_v6f32_to_v12i16(<6 x float> %a, i32 %b) {
+define <12 x i16> @bitcast_v6f32_to_v12i16(<6 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v6f32_to_v12i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3524,7 +3524,7 @@ end:
   ret <12 x i16> %phi
 }
 
-define inreg <12 x i16> @bitcast_v6f32_to_v12i16_scalar(<6 x float> inreg %a, i32 inreg %b) {
+define inreg <12 x i16> @bitcast_v6f32_to_v12i16_scalar(<6 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v6f32_to_v12i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3694,7 +3694,7 @@ end:
   ret <12 x i16> %phi
 }
 
-define <6 x float> @bitcast_v12i16_to_v6f32(<12 x i16> %a, i32 %b) {
+define <6 x float> @bitcast_v12i16_to_v6f32(<12 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v12i16_to_v6f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3868,7 +3868,7 @@ end:
   ret <6 x float> %phi
 }
 
-define inreg <6 x float> @bitcast_v12i16_to_v6f32_scalar(<12 x i16> inreg %a, i32 inreg %b) {
+define inreg <6 x float> @bitcast_v12i16_to_v6f32_scalar(<12 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v12i16_to_v6f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4061,7 +4061,7 @@ end:
   ret <6 x float> %phi
 }
 
-define <12 x half> @bitcast_v6f32_to_v12f16(<6 x float> %a, i32 %b) {
+define <12 x half> @bitcast_v6f32_to_v12f16(<6 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v6f32_to_v12f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4187,7 +4187,7 @@ end:
   ret <12 x half> %phi
 }
 
-define inreg <12 x half> @bitcast_v6f32_to_v12f16_scalar(<6 x float> inreg %a, i32 inreg %b) {
+define inreg <12 x half> @bitcast_v6f32_to_v12f16_scalar(<6 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v6f32_to_v12f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4357,7 +4357,7 @@ end:
   ret <12 x half> %phi
 }
 
-define <6 x float> @bitcast_v12f16_to_v6f32(<12 x half> %a, i32 %b) {
+define <6 x float> @bitcast_v12f16_to_v6f32(<12 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v12f16_to_v6f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4555,7 +4555,7 @@ end:
   ret <6 x float> %phi
 }
 
-define inreg <6 x float> @bitcast_v12f16_to_v6f32_scalar(<12 x half> inreg %a, i32 inreg %b) {
+define inreg <6 x float> @bitcast_v12f16_to_v6f32_scalar(<12 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v12f16_to_v6f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4770,7 +4770,7 @@ end:
   ret <6 x float> %phi
 }
 
-define <3 x double> @bitcast_v3i64_to_v3f64(<3 x i64> %a, i32 %b) {
+define <3 x double> @bitcast_v3i64_to_v3f64(<3 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v3i64_to_v3f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4862,7 +4862,7 @@ end:
   ret <3 x double> %phi
 }
 
-define inreg <3 x double> @bitcast_v3i64_to_v3f64_scalar(<3 x i64> inreg %a, i32 inreg %b) {
+define inreg <3 x double> @bitcast_v3i64_to_v3f64_scalar(<3 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v3i64_to_v3f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4978,7 +4978,7 @@ end:
   ret <3 x double> %phi
 }
 
-define <3 x i64> @bitcast_v3f64_to_v3i64(<3 x double> %a, i32 %b) {
+define <3 x i64> @bitcast_v3f64_to_v3i64(<3 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v3f64_to_v3i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5059,7 +5059,7 @@ end:
   ret <3 x i64> %phi
 }
 
-define inreg <3 x i64> @bitcast_v3f64_to_v3i64_scalar(<3 x double> inreg %a, i32 inreg %b) {
+define inreg <3 x i64> @bitcast_v3f64_to_v3i64_scalar(<3 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v3f64_to_v3i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5169,7 +5169,7 @@ end:
   ret <3 x i64> %phi
 }
 
-define <12 x i16> @bitcast_v3i64_to_v12i16(<3 x i64> %a, i32 %b) {
+define <12 x i16> @bitcast_v3i64_to_v12i16(<3 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v3i64_to_v12i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5300,7 +5300,7 @@ end:
   ret <12 x i16> %phi
 }
 
-define inreg <12 x i16> @bitcast_v3i64_to_v12i16_scalar(<3 x i64> inreg %a, i32 inreg %b) {
+define inreg <12 x i16> @bitcast_v3i64_to_v12i16_scalar(<3 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v3i64_to_v12i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5453,7 +5453,7 @@ end:
   ret <12 x i16> %phi
 }
 
-define <3 x i64> @bitcast_v12i16_to_v3i64(<12 x i16> %a, i32 %b) {
+define <3 x i64> @bitcast_v12i16_to_v3i64(<12 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v12i16_to_v3i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5627,7 +5627,7 @@ end:
   ret <3 x i64> %phi
 }
 
-define inreg <3 x i64> @bitcast_v12i16_to_v3i64_scalar(<12 x i16> inreg %a, i32 inreg %b) {
+define inreg <3 x i64> @bitcast_v12i16_to_v3i64_scalar(<12 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v12i16_to_v3i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5820,7 +5820,7 @@ end:
   ret <3 x i64> %phi
 }
 
-define <12 x half> @bitcast_v3i64_to_v12f16(<3 x i64> %a, i32 %b) {
+define <12 x half> @bitcast_v3i64_to_v12f16(<3 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v3i64_to_v12f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5951,7 +5951,7 @@ end:
   ret <12 x half> %phi
 }
 
-define inreg <12 x half> @bitcast_v3i64_to_v12f16_scalar(<3 x i64> inreg %a, i32 inreg %b) {
+define inreg <12 x half> @bitcast_v3i64_to_v12f16_scalar(<3 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v3i64_to_v12f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6104,7 +6104,7 @@ end:
   ret <12 x half> %phi
 }
 
-define <3 x i64> @bitcast_v12f16_to_v3i64(<12 x half> %a, i32 %b) {
+define <3 x i64> @bitcast_v12f16_to_v3i64(<12 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v12f16_to_v3i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6302,7 +6302,7 @@ end:
   ret <3 x i64> %phi
 }
 
-define inreg <3 x i64> @bitcast_v12f16_to_v3i64_scalar(<12 x half> inreg %a, i32 inreg %b) {
+define inreg <3 x i64> @bitcast_v12f16_to_v3i64_scalar(<12 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v12f16_to_v3i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6517,7 +6517,7 @@ end:
   ret <3 x i64> %phi
 }
 
-define <12 x i16> @bitcast_v3f64_to_v12i16(<3 x double> %a, i32 %b) {
+define <12 x i16> @bitcast_v3f64_to_v12i16(<3 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v3f64_to_v12i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6637,7 +6637,7 @@ end:
   ret <12 x i16> %phi
 }
 
-define inreg <12 x i16> @bitcast_v3f64_to_v12i16_scalar(<3 x double> inreg %a, i32 inreg %b) {
+define inreg <12 x i16> @bitcast_v3f64_to_v12i16_scalar(<3 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v3f64_to_v12i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6795,7 +6795,7 @@ end:
   ret <12 x i16> %phi
 }
 
-define <3 x double> @bitcast_v12i16_to_v3f64(<12 x i16> %a, i32 %b) {
+define <3 x double> @bitcast_v12i16_to_v3f64(<12 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v12i16_to_v3f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6969,7 +6969,7 @@ end:
   ret <3 x double> %phi
 }
 
-define inreg <3 x double> @bitcast_v12i16_to_v3f64_scalar(<12 x i16> inreg %a, i32 inreg %b) {
+define inreg <3 x double> @bitcast_v12i16_to_v3f64_scalar(<12 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v12i16_to_v3f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7162,7 +7162,7 @@ end:
   ret <3 x double> %phi
 }
 
-define <12 x half> @bitcast_v3f64_to_v12f16(<3 x double> %a, i32 %b) {
+define <12 x half> @bitcast_v3f64_to_v12f16(<3 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v3f64_to_v12f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7282,7 +7282,7 @@ end:
   ret <12 x half> %phi
 }
 
-define inreg <12 x half> @bitcast_v3f64_to_v12f16_scalar(<3 x double> inreg %a, i32 inreg %b) {
+define inreg <12 x half> @bitcast_v3f64_to_v12f16_scalar(<3 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v3f64_to_v12f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7440,7 +7440,7 @@ end:
   ret <12 x half> %phi
 }
 
-define <3 x double> @bitcast_v12f16_to_v3f64(<12 x half> %a, i32 %b) {
+define <3 x double> @bitcast_v12f16_to_v3f64(<12 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v12f16_to_v3f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7638,7 +7638,7 @@ end:
   ret <3 x double> %phi
 }
 
-define inreg <3 x double> @bitcast_v12f16_to_v3f64_scalar(<12 x half> inreg %a, i32 inreg %b) {
+define inreg <3 x double> @bitcast_v12f16_to_v3f64_scalar(<12 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v12f16_to_v3f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7853,7 +7853,7 @@ end:
   ret <3 x double> %phi
 }
 
-define <12 x half> @bitcast_v12i16_to_v12f16(<12 x i16> %a, i32 %b) {
+define <12 x half> @bitcast_v12i16_to_v12f16(<12 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v12i16_to_v12f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8052,7 +8052,7 @@ end:
   ret <12 x half> %phi
 }
 
-define inreg <12 x half> @bitcast_v12i16_to_v12f16_scalar(<12 x i16> inreg %a, i32 inreg %b) {
+define inreg <12 x half> @bitcast_v12i16_to_v12f16_scalar(<12 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v12i16_to_v12f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8283,7 +8283,7 @@ end:
   ret <12 x half> %phi
 }
 
-define <12 x i16> @bitcast_v12f16_to_v12i16(<12 x half> %a, i32 %b) {
+define <12 x i16> @bitcast_v12f16_to_v12i16(<12 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v12f16_to_v12i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8458,7 +8458,7 @@ end:
   ret <12 x i16> %phi
 }
 
-define inreg <12 x i16> @bitcast_v12f16_to_v12i16_scalar(<12 x half> inreg %a, i32 inreg %b) {
+define inreg <12 x i16> @bitcast_v12f16_to_v12i16_scalar(<12 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v12f16_to_v12i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8686,6 +8686,9 @@ end:
   %phi = phi <12 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
   ret <12 x i16> %phi
 }
+
+attributes #0 = { nounwind }
+
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; GFX11-FAKE16: {{.*}}
 ; GFX11-TRUE16: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll
index e0fac42ac9d77..c66f5ea8e6c60 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll
@@ -6,7 +6,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
-define <7 x float> @bitcast_v7i32_to_v7f32(<7 x i32> %a, i32 %b) {
+define <7 x float> @bitcast_v7i32_to_v7f32(<7 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v7i32_to_v7f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -101,7 +101,7 @@ end:
   ret <7 x float> %phi
 }
 
-define inreg <7 x float> @bitcast_v7i32_to_v7f32_scalar(<7 x i32> inreg %a, i32 inreg %b) {
+define inreg <7 x float> @bitcast_v7i32_to_v7f32_scalar(<7 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v7i32_to_v7f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -226,7 +226,7 @@ end:
   ret <7 x float> %phi
 }
 
-define <7 x i32> @bitcast_v7f32_to_v7i32(<7 x float> %a, i32 %b) {
+define <7 x i32> @bitcast_v7f32_to_v7i32(<7 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v7f32_to_v7i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -317,7 +317,7 @@ end:
   ret <7 x i32> %phi
 }
 
-define inreg <7 x i32> @bitcast_v7f32_to_v7i32_scalar(<7 x float> inreg %a, i32 inreg %b) {
+define inreg <7 x i32> @bitcast_v7f32_to_v7i32_scalar(<7 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v7f32_to_v7i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -448,7 +448,7 @@ end:
   ret <7 x i32> %phi
 }
 
-define <14 x i16> @bitcast_v7i32_to_v14i16(<7 x i32> %a, i32 %b) {
+define <14 x i16> @bitcast_v7i32_to_v14i16(<7 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v7i32_to_v14i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -588,7 +588,7 @@ end:
   ret <14 x i16> %phi
 }
 
-define inreg <14 x i16> @bitcast_v7i32_to_v14i16_scalar(<7 x i32> inreg %a, i32 inreg %b) {
+define inreg <14 x i16> @bitcast_v7i32_to_v14i16_scalar(<7 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v7i32_to_v14i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -755,7 +755,7 @@ end:
   ret <14 x i16> %phi
 }
 
-define <7 x i32> @bitcast_v14i16_to_v7i32(<14 x i16> %a, i32 %b) {
+define <7 x i32> @bitcast_v14i16_to_v7i32(<14 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v14i16_to_v7i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -946,7 +946,7 @@ end:
   ret <7 x i32> %phi
 }
 
-define inreg <7 x i32> @bitcast_v14i16_to_v7i32_scalar(<14 x i16> inreg %a, i32 inreg %b) {
+define inreg <7 x i32> @bitcast_v14i16_to_v7i32_scalar(<14 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v14i16_to_v7i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1160,7 +1160,7 @@ end:
   ret <7 x i32> %phi
 }
 
-define <14 x half> @bitcast_v7i32_to_v14f16(<7 x i32> %a, i32 %b) {
+define <14 x half> @bitcast_v7i32_to_v14f16(<7 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v7i32_to_v14f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1300,7 +1300,7 @@ end:
   ret <14 x half> %phi
 }
 
-define inreg <14 x half> @bitcast_v7i32_to_v14f16_scalar(<7 x i32> inreg %a, i32 inreg %b) {
+define inreg <14 x half> @bitcast_v7i32_to_v14f16_scalar(<7 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v7i32_to_v14f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1467,7 +1467,7 @@ end:
   ret <14 x half> %phi
 }
 
-define <7 x i32> @bitcast_v14f16_to_v7i32(<14 x half> %a, i32 %b) {
+define <7 x i32> @bitcast_v14f16_to_v7i32(<14 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v14f16_to_v7i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1686,7 +1686,7 @@ end:
   ret <7 x i32> %phi
 }
 
-define inreg <7 x i32> @bitcast_v14f16_to_v7i32_scalar(<14 x half> inreg %a, i32 inreg %b) {
+define inreg <7 x i32> @bitcast_v14f16_to_v7i32_scalar(<14 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v14f16_to_v7i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1925,7 +1925,7 @@ end:
   ret <7 x i32> %phi
 }
 
-define <14 x i16> @bitcast_v7f32_to_v14i16(<7 x float> %a, i32 %b) {
+define <14 x i16> @bitcast_v7f32_to_v14i16(<7 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v7f32_to_v14i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2061,7 +2061,7 @@ end:
   ret <14 x i16> %phi
 }
 
-define inreg <14 x i16> @bitcast_v7f32_to_v14i16_scalar(<7 x float> inreg %a, i32 inreg %b) {
+define inreg <14 x i16> @bitcast_v7f32_to_v14i16_scalar(<7 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v7f32_to_v14i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2244,7 +2244,7 @@ end:
   ret <14 x i16> %phi
 }
 
-define <7 x float> @bitcast_v14i16_to_v7f32(<14 x i16> %a, i32 %b) {
+define <7 x float> @bitcast_v14i16_to_v7f32(<14 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v14i16_to_v7f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2435,7 +2435,7 @@ end:
   ret <7 x float> %phi
 }
 
-define inreg <7 x float> @bitcast_v14i16_to_v7f32_scalar(<14 x i16> inreg %a, i32 inreg %b) {
+define inreg <7 x float> @bitcast_v14i16_to_v7f32_scalar(<14 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v14i16_to_v7f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2649,7 +2649,7 @@ end:
   ret <7 x float> %phi
 }
 
-define <14 x half> @bitcast_v7f32_to_v14f16(<7 x float> %a, i32 %b) {
+define <14 x half> @bitcast_v7f32_to_v14f16(<7 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v7f32_to_v14f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2785,7 +2785,7 @@ end:
   ret <14 x half> %phi
 }
 
-define inreg <14 x half> @bitcast_v7f32_to_v14f16_scalar(<7 x float> inreg %a, i32 inreg %b) {
+define inreg <14 x half> @bitcast_v7f32_to_v14f16_scalar(<7 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v7f32_to_v14f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2968,7 +2968,7 @@ end:
   ret <14 x half> %phi
 }
 
-define <7 x float> @bitcast_v14f16_to_v7f32(<14 x half> %a, i32 %b) {
+define <7 x float> @bitcast_v14f16_to_v7f32(<14 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v14f16_to_v7f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3187,7 +3187,7 @@ end:
   ret <7 x float> %phi
 }
 
-define inreg <7 x float> @bitcast_v14f16_to_v7f32_scalar(<14 x half> inreg %a, i32 inreg %b) {
+define inreg <7 x float> @bitcast_v14f16_to_v7f32_scalar(<14 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v14f16_to_v7f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3426,7 +3426,7 @@ end:
   ret <7 x float> %phi
 }
 
-define <14 x half> @bitcast_v14i16_to_v14f16(<14 x i16> %a, i32 %b) {
+define <14 x half> @bitcast_v14i16_to_v14f16(<14 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v14i16_to_v14f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3646,7 +3646,7 @@ end:
   ret <14 x half> %phi
 }
 
-define inreg <14 x half> @bitcast_v14i16_to_v14f16_scalar(<14 x i16> inreg %a, i32 inreg %b) {
+define inreg <14 x half> @bitcast_v14i16_to_v14f16_scalar(<14 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v14i16_to_v14f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3901,7 +3901,7 @@ end:
   ret <14 x half> %phi
 }
 
-define <14 x i16> @bitcast_v14f16_to_v14i16(<14 x half> %a, i32 %b) {
+define <14 x i16> @bitcast_v14f16_to_v14i16(<14 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v14f16_to_v14i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4094,7 +4094,7 @@ end:
   ret <14 x i16> %phi
 }
 
-define inreg <14 x i16> @bitcast_v14f16_to_v14i16_scalar(<14 x half> inreg %a, i32 inreg %b) {
+define inreg <14 x i16> @bitcast_v14f16_to_v14i16_scalar(<14 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v14f16_to_v14i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4344,6 +4344,9 @@ end:
   %phi = phi <14 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
   ret <14 x i16> %phi
 }
+
+attributes #0 = { nounwind }
+
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; GFX11-FAKE16: {{.*}}
 ; GFX11-TRUE16: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll
index d917cb0d623bc..cfc27af41e582 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll
@@ -6,7 +6,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
-define <8 x float> @bitcast_v8i32_to_v8f32(<8 x i32> %a, i32 %b) {
+define <8 x float> @bitcast_v8i32_to_v8f32(<8 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8i32_to_v8f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -105,7 +105,7 @@ end:
   ret <8 x float> %phi
 }
 
-define inreg <8 x float> @bitcast_v8i32_to_v8f32_scalar(<8 x i32> inreg %a, i32 inreg %b) {
+define inreg <8 x float> @bitcast_v8i32_to_v8f32_scalar(<8 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8i32_to_v8f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -237,7 +237,7 @@ end:
   ret <8 x float> %phi
 }
 
-define <8 x i32> @bitcast_v8f32_to_v8i32(<8 x float> %a, i32 %b) {
+define <8 x i32> @bitcast_v8f32_to_v8i32(<8 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8f32_to_v8i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -331,7 +331,7 @@ end:
   ret <8 x i32> %phi
 }
 
-define inreg <8 x i32> @bitcast_v8f32_to_v8i32_scalar(<8 x float> inreg %a, i32 inreg %b) {
+define inreg <8 x i32> @bitcast_v8f32_to_v8i32_scalar(<8 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8f32_to_v8i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -470,7 +470,7 @@ end:
   ret <8 x i32> %phi
 }
 
-define <4 x i64> @bitcast_v8i32_to_v4i64(<8 x i32> %a, i32 %b) {
+define <4 x i64> @bitcast_v8i32_to_v4i64(<8 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8i32_to_v4i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -569,7 +569,7 @@ end:
   ret <4 x i64> %phi
 }
 
-define inreg <4 x i64> @bitcast_v8i32_to_v4i64_scalar(<8 x i32> inreg %a, i32 inreg %b) {
+define inreg <4 x i64> @bitcast_v8i32_to_v4i64_scalar(<8 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8i32_to_v4i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -701,7 +701,7 @@ end:
   ret <4 x i64> %phi
 }
 
-define <8 x i32> @bitcast_v4i64_to_v8i32(<4 x i64> %a, i32 %b) {
+define <8 x i32> @bitcast_v4i64_to_v8i32(<4 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4i64_to_v8i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -802,7 +802,7 @@ end:
   ret <8 x i32> %phi
 }
 
-define inreg <8 x i32> @bitcast_v4i64_to_v8i32_scalar(<4 x i64> inreg %a, i32 inreg %b) {
+define inreg <8 x i32> @bitcast_v4i64_to_v8i32_scalar(<4 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4i64_to_v8i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -934,7 +934,7 @@ end:
   ret <8 x i32> %phi
 }
 
-define <4 x double> @bitcast_v8i32_to_v4f64(<8 x i32> %a, i32 %b) {
+define <4 x double> @bitcast_v8i32_to_v4f64(<8 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8i32_to_v4f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1033,7 +1033,7 @@ end:
   ret <4 x double> %phi
 }
 
-define inreg <4 x double> @bitcast_v8i32_to_v4f64_scalar(<8 x i32> inreg %a, i32 inreg %b) {
+define inreg <4 x double> @bitcast_v8i32_to_v4f64_scalar(<8 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8i32_to_v4f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1165,7 +1165,7 @@ end:
   ret <4 x double> %phi
 }
 
-define <8 x i32> @bitcast_v4f64_to_v8i32(<4 x double> %a, i32 %b) {
+define <8 x i32> @bitcast_v4f64_to_v8i32(<4 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4f64_to_v8i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1250,7 +1250,7 @@ end:
   ret <8 x i32> %phi
 }
 
-define inreg <8 x i32> @bitcast_v4f64_to_v8i32_scalar(<4 x double> inreg %a, i32 inreg %b) {
+define inreg <8 x i32> @bitcast_v4f64_to_v8i32_scalar(<4 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4f64_to_v8i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1373,7 +1373,7 @@ end:
   ret <8 x i32> %phi
 }
 
-define <16 x i16> @bitcast_v8i32_to_v16i16(<8 x i32> %a, i32 %b) {
+define <16 x i16> @bitcast_v8i32_to_v16i16(<8 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8i32_to_v16i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1523,7 +1523,7 @@ end:
   ret <16 x i16> %phi
 }
 
-define inreg <16 x i16> @bitcast_v8i32_to_v16i16_scalar(<8 x i32> inreg %a, i32 inreg %b) {
+define inreg <16 x i16> @bitcast_v8i32_to_v16i16_scalar(<8 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8i32_to_v16i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1703,7 +1703,7 @@ end:
   ret <16 x i16> %phi
 }
 
-define <8 x i32> @bitcast_v16i16_to_v8i32(<16 x i16> %a, i32 %b) {
+define <8 x i32> @bitcast_v16i16_to_v8i32(<16 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16i16_to_v8i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1910,7 +1910,7 @@ end:
   ret <8 x i32> %phi
 }
 
-define inreg <8 x i32> @bitcast_v16i16_to_v8i32_scalar(<16 x i16> inreg %a, i32 inreg %b) {
+define inreg <8 x i32> @bitcast_v16i16_to_v8i32_scalar(<16 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16i16_to_v8i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2144,7 +2144,7 @@ end:
   ret <8 x i32> %phi
 }
 
-define <16 x half> @bitcast_v8i32_to_v16f16(<8 x i32> %a, i32 %b) {
+define <16 x half> @bitcast_v8i32_to_v16f16(<8 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8i32_to_v16f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2294,7 +2294,7 @@ end:
   ret <16 x half> %phi
 }
 
-define inreg <16 x half> @bitcast_v8i32_to_v16f16_scalar(<8 x i32> inreg %a, i32 inreg %b) {
+define inreg <16 x half> @bitcast_v8i32_to_v16f16_scalar(<8 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8i32_to_v16f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2474,7 +2474,7 @@ end:
   ret <16 x half> %phi
 }
 
-define <8 x i32> @bitcast_v16f16_to_v8i32(<16 x half> %a, i32 %b) {
+define <8 x i32> @bitcast_v16f16_to_v8i32(<16 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16f16_to_v8i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2713,7 +2713,7 @@ end:
   ret <8 x i32> %phi
 }
 
-define inreg <8 x i32> @bitcast_v16f16_to_v8i32_scalar(<16 x half> inreg %a, i32 inreg %b) {
+define inreg <8 x i32> @bitcast_v16f16_to_v8i32_scalar(<16 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16f16_to_v8i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2975,7 +2975,7 @@ end:
   ret <8 x i32> %phi
 }
 
-define <16 x bfloat> @bitcast_v8i32_to_v16bf16(<8 x i32> %a, i32 %b) {
+define <16 x bfloat> @bitcast_v8i32_to_v16bf16(<8 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8i32_to_v16bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3166,7 +3166,7 @@ end:
   ret <16 x bfloat> %phi
 }
 
-define inreg <16 x bfloat> @bitcast_v8i32_to_v16bf16_scalar(<8 x i32> inreg %a, i32 inreg %b) {
+define inreg <16 x bfloat> @bitcast_v8i32_to_v16bf16_scalar(<8 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8i32_to_v16bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3370,7 +3370,7 @@ end:
   ret <16 x bfloat> %phi
 }
 
-define <8 x i32> @bitcast_v16bf16_to_v8i32(<16 x bfloat> %a, i32 %b) {
+define <8 x i32> @bitcast_v16bf16_to_v8i32(<16 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16bf16_to_v8i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4114,7 +4114,7 @@ end:
   ret <8 x i32> %phi
 }
 
-define inreg <8 x i32> @bitcast_v16bf16_to_v8i32_scalar(<16 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <8 x i32> @bitcast_v16bf16_to_v8i32_scalar(<16 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16bf16_to_v8i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4966,7 +4966,7 @@ end:
   ret <8 x i32> %phi
 }
 
-define <32 x i8> @bitcast_v8i32_to_v32i8(<8 x i32> %a, i32 %b) {
+define <32 x i8> @bitcast_v8i32_to_v32i8(<8 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8i32_to_v32i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5511,7 +5511,7 @@ end:
   ret <32 x i8> %phi
 }
 
-define inreg <32 x i8> @bitcast_v8i32_to_v32i8_scalar(<8 x i32> inreg %a, i32 inreg %b) {
+define inreg <32 x i8> @bitcast_v8i32_to_v32i8_scalar(<8 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8i32_to_v32i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6015,7 +6015,7 @@ end:
   ret <32 x i8> %phi
 }
 
-define <8 x i32> @bitcast_v32i8_to_v8i32(<32 x i8> %a, i32 %b) {
+define <8 x i32> @bitcast_v32i8_to_v8i32(<32 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v32i8_to_v8i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6959,7 +6959,7 @@ end:
   ret <8 x i32> %phi
 }
 
-define inreg <8 x i32> @bitcast_v32i8_to_v8i32_scalar(<32 x i8> inreg %a, i32 inreg %b) {
+define inreg <8 x i32> @bitcast_v32i8_to_v8i32_scalar(<32 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v32i8_to_v8i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7650,7 +7650,7 @@ end:
   ret <8 x i32> %phi
 }
 
-define <4 x i64> @bitcast_v8f32_to_v4i64(<8 x float> %a, i32 %b) {
+define <4 x i64> @bitcast_v8f32_to_v4i64(<8 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8f32_to_v4i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7744,7 +7744,7 @@ end:
   ret <4 x i64> %phi
 }
 
-define inreg <4 x i64> @bitcast_v8f32_to_v4i64_scalar(<8 x float> inreg %a, i32 inreg %b) {
+define inreg <4 x i64> @bitcast_v8f32_to_v4i64_scalar(<8 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8f32_to_v4i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7883,7 +7883,7 @@ end:
   ret <4 x i64> %phi
 }
 
-define <8 x float> @bitcast_v4i64_to_v8f32(<4 x i64> %a, i32 %b) {
+define <8 x float> @bitcast_v4i64_to_v8f32(<4 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4i64_to_v8f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7984,7 +7984,7 @@ end:
   ret <8 x float> %phi
 }
 
-define inreg <8 x float> @bitcast_v4i64_to_v8f32_scalar(<4 x i64> inreg %a, i32 inreg %b) {
+define inreg <8 x float> @bitcast_v4i64_to_v8f32_scalar(<4 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4i64_to_v8f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8116,7 +8116,7 @@ end:
   ret <8 x float> %phi
 }
 
-define <4 x double> @bitcast_v8f32_to_v4f64(<8 x float> %a, i32 %b) {
+define <4 x double> @bitcast_v8f32_to_v4f64(<8 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8f32_to_v4f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8210,7 +8210,7 @@ end:
   ret <4 x double> %phi
 }
 
-define inreg <4 x double> @bitcast_v8f32_to_v4f64_scalar(<8 x float> inreg %a, i32 inreg %b) {
+define inreg <4 x double> @bitcast_v8f32_to_v4f64_scalar(<8 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8f32_to_v4f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8349,7 +8349,7 @@ end:
   ret <4 x double> %phi
 }
 
-define <8 x float> @bitcast_v4f64_to_v8f32(<4 x double> %a, i32 %b) {
+define <8 x float> @bitcast_v4f64_to_v8f32(<4 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4f64_to_v8f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8434,7 +8434,7 @@ end:
   ret <8 x float> %phi
 }
 
-define inreg <8 x float> @bitcast_v4f64_to_v8f32_scalar(<4 x double> inreg %a, i32 inreg %b) {
+define inreg <8 x float> @bitcast_v4f64_to_v8f32_scalar(<4 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4f64_to_v8f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8557,7 +8557,7 @@ end:
   ret <8 x float> %phi
 }
 
-define <16 x i16> @bitcast_v8f32_to_v16i16(<8 x float> %a, i32 %b) {
+define <16 x i16> @bitcast_v8f32_to_v16i16(<8 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8f32_to_v16i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8702,7 +8702,7 @@ end:
   ret <16 x i16> %phi
 }
 
-define inreg <16 x i16> @bitcast_v8f32_to_v16i16_scalar(<8 x float> inreg %a, i32 inreg %b) {
+define inreg <16 x i16> @bitcast_v8f32_to_v16i16_scalar(<8 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8f32_to_v16i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8898,7 +8898,7 @@ end:
   ret <16 x i16> %phi
 }
 
-define <8 x float> @bitcast_v16i16_to_v8f32(<16 x i16> %a, i32 %b) {
+define <8 x float> @bitcast_v16i16_to_v8f32(<16 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16i16_to_v8f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9105,7 +9105,7 @@ end:
   ret <8 x float> %phi
 }
 
-define inreg <8 x float> @bitcast_v16i16_to_v8f32_scalar(<16 x i16> inreg %a, i32 inreg %b) {
+define inreg <8 x float> @bitcast_v16i16_to_v8f32_scalar(<16 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16i16_to_v8f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9339,7 +9339,7 @@ end:
   ret <8 x float> %phi
 }
 
-define <16 x half> @bitcast_v8f32_to_v16f16(<8 x float> %a, i32 %b) {
+define <16 x half> @bitcast_v8f32_to_v16f16(<8 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8f32_to_v16f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9484,7 +9484,7 @@ end:
   ret <16 x half> %phi
 }
 
-define inreg <16 x half> @bitcast_v8f32_to_v16f16_scalar(<8 x float> inreg %a, i32 inreg %b) {
+define inreg <16 x half> @bitcast_v8f32_to_v16f16_scalar(<8 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8f32_to_v16f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9680,7 +9680,7 @@ end:
   ret <16 x half> %phi
 }
 
-define <8 x float> @bitcast_v16f16_to_v8f32(<16 x half> %a, i32 %b) {
+define <8 x float> @bitcast_v16f16_to_v8f32(<16 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16f16_to_v8f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9919,7 +9919,7 @@ end:
   ret <8 x float> %phi
 }
 
-define inreg <8 x float> @bitcast_v16f16_to_v8f32_scalar(<16 x half> inreg %a, i32 inreg %b) {
+define inreg <8 x float> @bitcast_v16f16_to_v8f32_scalar(<16 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16f16_to_v8f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10181,7 +10181,7 @@ end:
   ret <8 x float> %phi
 }
 
-define <16 x bfloat> @bitcast_v8f32_to_v16bf16(<8 x float> %a, i32 %b) {
+define <16 x bfloat> @bitcast_v8f32_to_v16bf16(<8 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8f32_to_v16bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10367,7 +10367,7 @@ end:
   ret <16 x bfloat> %phi
 }
 
-define inreg <16 x bfloat> @bitcast_v8f32_to_v16bf16_scalar(<8 x float> inreg %a, i32 inreg %b) {
+define inreg <16 x bfloat> @bitcast_v8f32_to_v16bf16_scalar(<8 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8f32_to_v16bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10595,7 +10595,7 @@ end:
   ret <16 x bfloat> %phi
 }
 
-define <8 x float> @bitcast_v16bf16_to_v8f32(<16 x bfloat> %a, i32 %b) {
+define <8 x float> @bitcast_v16bf16_to_v8f32(<16 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16bf16_to_v8f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11339,7 +11339,7 @@ end:
   ret <8 x float> %phi
 }
 
-define inreg <8 x float> @bitcast_v16bf16_to_v8f32_scalar(<16 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <8 x float> @bitcast_v16bf16_to_v8f32_scalar(<16 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16bf16_to_v8f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12191,7 +12191,7 @@ end:
   ret <8 x float> %phi
 }
 
-define <32 x i8> @bitcast_v8f32_to_v32i8(<8 x float> %a, i32 %b) {
+define <32 x i8> @bitcast_v8f32_to_v32i8(<8 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8f32_to_v32i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12732,7 +12732,7 @@ end:
   ret <32 x i8> %phi
 }
 
-define inreg <32 x i8> @bitcast_v8f32_to_v32i8_scalar(<8 x float> inreg %a, i32 inreg %b) {
+define inreg <32 x i8> @bitcast_v8f32_to_v32i8_scalar(<8 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8f32_to_v32i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13283,7 +13283,7 @@ end:
   ret <32 x i8> %phi
 }
 
-define <8 x float> @bitcast_v32i8_to_v8f32(<32 x i8> %a, i32 %b) {
+define <8 x float> @bitcast_v32i8_to_v8f32(<32 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v32i8_to_v8f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14227,7 +14227,7 @@ end:
   ret <8 x float> %phi
 }
 
-define inreg <8 x float> @bitcast_v32i8_to_v8f32_scalar(<32 x i8> inreg %a, i32 inreg %b) {
+define inreg <8 x float> @bitcast_v32i8_to_v8f32_scalar(<32 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v32i8_to_v8f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14918,7 +14918,7 @@ end:
   ret <8 x float> %phi
 }
 
-define <4 x double> @bitcast_v4i64_to_v4f64(<4 x i64> %a, i32 %b) {
+define <4 x double> @bitcast_v4i64_to_v4f64(<4 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4i64_to_v4f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15019,7 +15019,7 @@ end:
   ret <4 x double> %phi
 }
 
-define inreg <4 x double> @bitcast_v4i64_to_v4f64_scalar(<4 x i64> inreg %a, i32 inreg %b) {
+define inreg <4 x double> @bitcast_v4i64_to_v4f64_scalar(<4 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4i64_to_v4f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15150,7 +15150,7 @@ end:
   ret <4 x double> %phi
 }
 
-define <4 x i64> @bitcast_v4f64_to_v4i64(<4 x double> %a, i32 %b) {
+define <4 x i64> @bitcast_v4f64_to_v4i64(<4 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4f64_to_v4i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15235,7 +15235,7 @@ end:
   ret <4 x i64> %phi
 }
 
-define inreg <4 x i64> @bitcast_v4f64_to_v4i64_scalar(<4 x double> inreg %a, i32 inreg %b) {
+define inreg <4 x i64> @bitcast_v4f64_to_v4i64_scalar(<4 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4f64_to_v4i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15358,7 +15358,7 @@ end:
   ret <4 x i64> %phi
 }
 
-define <16 x i16> @bitcast_v4i64_to_v16i16(<4 x i64> %a, i32 %b) {
+define <16 x i16> @bitcast_v4i64_to_v16i16(<4 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4i64_to_v16i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15510,7 +15510,7 @@ end:
   ret <16 x i16> %phi
 }
 
-define inreg <16 x i16> @bitcast_v4i64_to_v16i16_scalar(<4 x i64> inreg %a, i32 inreg %b) {
+define inreg <16 x i16> @bitcast_v4i64_to_v16i16_scalar(<4 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4i64_to_v16i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15690,7 +15690,7 @@ end:
   ret <16 x i16> %phi
 }
 
-define <4 x i64> @bitcast_v16i16_to_v4i64(<16 x i16> %a, i32 %b) {
+define <4 x i64> @bitcast_v16i16_to_v4i64(<16 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16i16_to_v4i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15897,7 +15897,7 @@ end:
   ret <4 x i64> %phi
 }
 
-define inreg <4 x i64> @bitcast_v16i16_to_v4i64_scalar(<16 x i16> inreg %a, i32 inreg %b) {
+define inreg <4 x i64> @bitcast_v16i16_to_v4i64_scalar(<16 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16i16_to_v4i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16131,7 +16131,7 @@ end:
   ret <4 x i64> %phi
 }
 
-define <16 x half> @bitcast_v4i64_to_v16f16(<4 x i64> %a, i32 %b) {
+define <16 x half> @bitcast_v4i64_to_v16f16(<4 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4i64_to_v16f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16283,7 +16283,7 @@ end:
   ret <16 x half> %phi
 }
 
-define inreg <16 x half> @bitcast_v4i64_to_v16f16_scalar(<4 x i64> inreg %a, i32 inreg %b) {
+define inreg <16 x half> @bitcast_v4i64_to_v16f16_scalar(<4 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4i64_to_v16f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16463,7 +16463,7 @@ end:
   ret <16 x half> %phi
 }
 
-define <4 x i64> @bitcast_v16f16_to_v4i64(<16 x half> %a, i32 %b) {
+define <4 x i64> @bitcast_v16f16_to_v4i64(<16 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16f16_to_v4i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16702,7 +16702,7 @@ end:
   ret <4 x i64> %phi
 }
 
-define inreg <4 x i64> @bitcast_v16f16_to_v4i64_scalar(<16 x half> inreg %a, i32 inreg %b) {
+define inreg <4 x i64> @bitcast_v16f16_to_v4i64_scalar(<16 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16f16_to_v4i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16964,7 +16964,7 @@ end:
   ret <4 x i64> %phi
 }
 
-define <16 x bfloat> @bitcast_v4i64_to_v16bf16(<4 x i64> %a, i32 %b) {
+define <16 x bfloat> @bitcast_v4i64_to_v16bf16(<4 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4i64_to_v16bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17157,7 +17157,7 @@ end:
   ret <16 x bfloat> %phi
 }
 
-define inreg <16 x bfloat> @bitcast_v4i64_to_v16bf16_scalar(<4 x i64> inreg %a, i32 inreg %b) {
+define inreg <16 x bfloat> @bitcast_v4i64_to_v16bf16_scalar(<4 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4i64_to_v16bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17361,7 +17361,7 @@ end:
   ret <16 x bfloat> %phi
 }
 
-define <4 x i64> @bitcast_v16bf16_to_v4i64(<16 x bfloat> %a, i32 %b) {
+define <4 x i64> @bitcast_v16bf16_to_v4i64(<16 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16bf16_to_v4i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18105,7 +18105,7 @@ end:
   ret <4 x i64> %phi
 }
 
-define inreg <4 x i64> @bitcast_v16bf16_to_v4i64_scalar(<16 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <4 x i64> @bitcast_v16bf16_to_v4i64_scalar(<16 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16bf16_to_v4i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18957,7 +18957,7 @@ end:
   ret <4 x i64> %phi
 }
 
-define <32 x i8> @bitcast_v4i64_to_v32i8(<4 x i64> %a, i32 %b) {
+define <32 x i8> @bitcast_v4i64_to_v32i8(<4 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4i64_to_v32i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19508,7 +19508,7 @@ end:
   ret <32 x i8> %phi
 }
 
-define inreg <32 x i8> @bitcast_v4i64_to_v32i8_scalar(<4 x i64> inreg %a, i32 inreg %b) {
+define inreg <32 x i8> @bitcast_v4i64_to_v32i8_scalar(<4 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4i64_to_v32i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20012,7 +20012,7 @@ end:
   ret <32 x i8> %phi
 }
 
-define <4 x i64> @bitcast_v32i8_to_v4i64(<32 x i8> %a, i32 %b) {
+define <4 x i64> @bitcast_v32i8_to_v4i64(<32 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v32i8_to_v4i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20956,7 +20956,7 @@ end:
   ret <4 x i64> %phi
 }
 
-define inreg <4 x i64> @bitcast_v32i8_to_v4i64_scalar(<32 x i8> inreg %a, i32 inreg %b) {
+define inreg <4 x i64> @bitcast_v32i8_to_v4i64_scalar(<32 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v32i8_to_v4i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -21647,7 +21647,7 @@ end:
   ret <4 x i64> %phi
 }
 
-define <16 x i16> @bitcast_v4f64_to_v16i16(<4 x double> %a, i32 %b) {
+define <16 x i16> @bitcast_v4f64_to_v16i16(<4 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4f64_to_v16i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -21783,7 +21783,7 @@ end:
   ret <16 x i16> %phi
 }
 
-define inreg <16 x i16> @bitcast_v4f64_to_v16i16_scalar(<4 x double> inreg %a, i32 inreg %b) {
+define inreg <16 x i16> @bitcast_v4f64_to_v16i16_scalar(<4 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4f64_to_v16i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -21963,7 +21963,7 @@ end:
   ret <16 x i16> %phi
 }
 
-define <4 x double> @bitcast_v16i16_to_v4f64(<16 x i16> %a, i32 %b) {
+define <4 x double> @bitcast_v16i16_to_v4f64(<16 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16i16_to_v4f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22170,7 +22170,7 @@ end:
   ret <4 x double> %phi
 }
 
-define inreg <4 x double> @bitcast_v16i16_to_v4f64_scalar(<16 x i16> inreg %a, i32 inreg %b) {
+define inreg <4 x double> @bitcast_v16i16_to_v4f64_scalar(<16 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16i16_to_v4f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22404,7 +22404,7 @@ end:
   ret <4 x double> %phi
 }
 
-define <16 x half> @bitcast_v4f64_to_v16f16(<4 x double> %a, i32 %b) {
+define <16 x half> @bitcast_v4f64_to_v16f16(<4 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4f64_to_v16f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22540,7 +22540,7 @@ end:
   ret <16 x half> %phi
 }
 
-define inreg <16 x half> @bitcast_v4f64_to_v16f16_scalar(<4 x double> inreg %a, i32 inreg %b) {
+define inreg <16 x half> @bitcast_v4f64_to_v16f16_scalar(<4 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4f64_to_v16f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22720,7 +22720,7 @@ end:
   ret <16 x half> %phi
 }
 
-define <4 x double> @bitcast_v16f16_to_v4f64(<16 x half> %a, i32 %b) {
+define <4 x double> @bitcast_v16f16_to_v4f64(<16 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16f16_to_v4f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22959,7 +22959,7 @@ end:
   ret <4 x double> %phi
 }
 
-define inreg <4 x double> @bitcast_v16f16_to_v4f64_scalar(<16 x half> inreg %a, i32 inreg %b) {
+define inreg <4 x double> @bitcast_v16f16_to_v4f64_scalar(<16 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16f16_to_v4f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -23221,7 +23221,7 @@ end:
   ret <4 x double> %phi
 }
 
-define <16 x bfloat> @bitcast_v4f64_to_v16bf16(<4 x double> %a, i32 %b) {
+define <16 x bfloat> @bitcast_v4f64_to_v16bf16(<4 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4f64_to_v16bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -23394,7 +23394,7 @@ end:
   ret <16 x bfloat> %phi
 }
 
-define inreg <16 x bfloat> @bitcast_v4f64_to_v16bf16_scalar(<4 x double> inreg %a, i32 inreg %b) {
+define inreg <16 x bfloat> @bitcast_v4f64_to_v16bf16_scalar(<4 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4f64_to_v16bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -23606,7 +23606,7 @@ end:
   ret <16 x bfloat> %phi
 }
 
-define <4 x double> @bitcast_v16bf16_to_v4f64(<16 x bfloat> %a, i32 %b) {
+define <4 x double> @bitcast_v16bf16_to_v4f64(<16 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16bf16_to_v4f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -24350,7 +24350,7 @@ end:
   ret <4 x double> %phi
 }
 
-define inreg <4 x double> @bitcast_v16bf16_to_v4f64_scalar(<16 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <4 x double> @bitcast_v16bf16_to_v4f64_scalar(<16 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16bf16_to_v4f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -25202,7 +25202,7 @@ end:
   ret <4 x double> %phi
 }
 
-define <32 x i8> @bitcast_v4f64_to_v32i8(<4 x double> %a, i32 %b) {
+define <32 x i8> @bitcast_v4f64_to_v32i8(<4 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4f64_to_v32i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -25737,7 +25737,7 @@ end:
   ret <32 x i8> %phi
 }
 
-define inreg <32 x i8> @bitcast_v4f64_to_v32i8_scalar(<4 x double> inreg %a, i32 inreg %b) {
+define inreg <32 x i8> @bitcast_v4f64_to_v32i8_scalar(<4 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4f64_to_v32i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -26274,7 +26274,7 @@ end:
   ret <32 x i8> %phi
 }
 
-define <4 x double> @bitcast_v32i8_to_v4f64(<32 x i8> %a, i32 %b) {
+define <4 x double> @bitcast_v32i8_to_v4f64(<32 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v32i8_to_v4f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -27218,7 +27218,7 @@ end:
   ret <4 x double> %phi
 }
 
-define inreg <4 x double> @bitcast_v32i8_to_v4f64_scalar(<32 x i8> inreg %a, i32 inreg %b) {
+define inreg <4 x double> @bitcast_v32i8_to_v4f64_scalar(<32 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v32i8_to_v4f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -27909,7 +27909,7 @@ end:
   ret <4 x double> %phi
 }
 
-define <16 x half> @bitcast_v16i16_to_v16f16(<16 x i16> %a, i32 %b) {
+define <16 x half> @bitcast_v16i16_to_v16f16(<16 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16i16_to_v16f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -28151,7 +28151,7 @@ end:
   ret <16 x half> %phi
 }
 
-define inreg <16 x half> @bitcast_v16i16_to_v16f16_scalar(<16 x i16> inreg %a, i32 inreg %b) {
+define inreg <16 x half> @bitcast_v16i16_to_v16f16_scalar(<16 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16i16_to_v16f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -28432,7 +28432,7 @@ end:
   ret <16 x half> %phi
 }
 
-define <16 x i16> @bitcast_v16f16_to_v16i16(<16 x half> %a, i32 %b) {
+define <16 x i16> @bitcast_v16f16_to_v16i16(<16 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16f16_to_v16i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -28643,7 +28643,7 @@ end:
   ret <16 x i16> %phi
 }
 
-define inreg <16 x i16> @bitcast_v16f16_to_v16i16_scalar(<16 x half> inreg %a, i32 inreg %b) {
+define inreg <16 x i16> @bitcast_v16f16_to_v16i16_scalar(<16 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16f16_to_v16i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -28917,7 +28917,7 @@ end:
   ret <16 x i16> %phi
 }
 
-define <16 x bfloat> @bitcast_v16i16_to_v16bf16(<16 x i16> %a, i32 %b) {
+define <16 x bfloat> @bitcast_v16i16_to_v16bf16(<16 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16i16_to_v16bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -29150,7 +29150,7 @@ end:
   ret <16 x bfloat> %phi
 }
 
-define inreg <16 x bfloat> @bitcast_v16i16_to_v16bf16_scalar(<16 x i16> inreg %a, i32 inreg %b) {
+define inreg <16 x bfloat> @bitcast_v16i16_to_v16bf16_scalar(<16 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16i16_to_v16bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -29431,7 +29431,7 @@ end:
   ret <16 x bfloat> %phi
 }
 
-define <16 x i16> @bitcast_v16bf16_to_v16i16(<16 x bfloat> %a, i32 %b) {
+define <16 x i16> @bitcast_v16bf16_to_v16i16(<16 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16bf16_to_v16i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -30212,7 +30212,7 @@ end:
   ret <16 x i16> %phi
 }
 
-define inreg <16 x i16> @bitcast_v16bf16_to_v16i16_scalar(<16 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <16 x i16> @bitcast_v16bf16_to_v16i16_scalar(<16 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16bf16_to_v16i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -31054,7 +31054,7 @@ end:
   ret <16 x i16> %phi
 }
 
-define <32 x i8> @bitcast_v16i16_to_v32i8(<16 x i16> %a, i32 %b) {
+define <32 x i8> @bitcast_v16i16_to_v32i8(<16 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16i16_to_v32i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -31702,7 +31702,7 @@ end:
   ret <32 x i8> %phi
 }
 
-define inreg <32 x i8> @bitcast_v16i16_to_v32i8_scalar(<16 x i16> inreg %a, i32 inreg %b) {
+define inreg <32 x i8> @bitcast_v16i16_to_v32i8_scalar(<16 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16i16_to_v32i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -32321,7 +32321,7 @@ end:
   ret <32 x i8> %phi
 }
 
-define <16 x i16> @bitcast_v32i8_to_v16i16(<32 x i8> %a, i32 %b) {
+define <16 x i16> @bitcast_v32i8_to_v16i16(<32 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v32i8_to_v16i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -33316,7 +33316,7 @@ end:
   ret <16 x i16> %phi
 }
 
-define inreg <16 x i16> @bitcast_v32i8_to_v16i16_scalar(<32 x i8> inreg %a, i32 inreg %b) {
+define inreg <16 x i16> @bitcast_v32i8_to_v16i16_scalar(<32 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v32i8_to_v16i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -34208,7 +34208,7 @@ end:
   ret <16 x i16> %phi
 }
 
-define <16 x bfloat> @bitcast_v16f16_to_v16bf16(<16 x half> %a, i32 %b) {
+define <16 x bfloat> @bitcast_v16f16_to_v16bf16(<16 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16f16_to_v16bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -34474,7 +34474,7 @@ end:
   ret <16 x bfloat> %phi
 }
 
-define inreg <16 x bfloat> @bitcast_v16f16_to_v16bf16_scalar(<16 x half> inreg %a, i32 inreg %b) {
+define inreg <16 x bfloat> @bitcast_v16f16_to_v16bf16_scalar(<16 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16f16_to_v16bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -34784,7 +34784,7 @@ end:
   ret <16 x bfloat> %phi
 }
 
-define <16 x half> @bitcast_v16bf16_to_v16f16(<16 x bfloat> %a, i32 %b) {
+define <16 x half> @bitcast_v16bf16_to_v16f16(<16 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16bf16_to_v16f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -35583,7 +35583,7 @@ end:
   ret <16 x half> %phi
 }
 
-define inreg <16 x half> @bitcast_v16bf16_to_v16f16_scalar(<16 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <16 x half> @bitcast_v16bf16_to_v16f16_scalar(<16 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16bf16_to_v16f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -36472,7 +36472,7 @@ end:
   ret <16 x half> %phi
 }
 
-define <32 x i8> @bitcast_v16f16_to_v32i8(<16 x half> %a, i32 %b) {
+define <32 x i8> @bitcast_v16f16_to_v32i8(<16 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16f16_to_v32i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -37127,7 +37127,7 @@ end:
   ret <32 x i8> %phi
 }
 
-define inreg <32 x i8> @bitcast_v16f16_to_v32i8_scalar(<16 x half> inreg %a, i32 inreg %b) {
+define inreg <32 x i8> @bitcast_v16f16_to_v32i8_scalar(<16 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16f16_to_v32i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -37780,7 +37780,7 @@ end:
   ret <32 x i8> %phi
 }
 
-define <16 x half> @bitcast_v32i8_to_v16f16(<32 x i8> %a, i32 %b) {
+define <16 x half> @bitcast_v32i8_to_v16f16(<32 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v32i8_to_v16f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -38775,7 +38775,7 @@ end:
   ret <16 x half> %phi
 }
 
-define inreg <16 x half> @bitcast_v32i8_to_v16f16_scalar(<32 x i8> inreg %a, i32 inreg %b) {
+define inreg <16 x half> @bitcast_v32i8_to_v16f16_scalar(<32 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v32i8_to_v16f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -39667,7 +39667,7 @@ end:
   ret <16 x half> %phi
 }
 
-define <32 x i8> @bitcast_v16bf16_to_v32i8(<16 x bfloat> %a, i32 %b) {
+define <32 x i8> @bitcast_v16bf16_to_v32i8(<16 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16bf16_to_v32i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -40848,7 +40848,7 @@ end:
   ret <32 x i8> %phi
 }
 
-define inreg <32 x i8> @bitcast_v16bf16_to_v32i8_scalar(<16 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <32 x i8> @bitcast_v16bf16_to_v32i8_scalar(<16 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16bf16_to_v32i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -42182,7 +42182,7 @@ end:
   ret <32 x i8> %phi
 }
 
-define <16 x bfloat> @bitcast_v32i8_to_v16bf16(<32 x i8> %a, i32 %b) {
+define <16 x bfloat> @bitcast_v32i8_to_v16bf16(<32 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v32i8_to_v16bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -43181,7 +43181,7 @@ end:
   ret <16 x bfloat> %phi
 }
 
-define inreg <16 x bfloat> @bitcast_v32i8_to_v16bf16_scalar(<32 x i8> inreg %a, i32 inreg %b) {
+define inreg <16 x bfloat> @bitcast_v32i8_to_v16bf16_scalar(<32 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v32i8_to_v16bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -44064,3 +44064,5 @@ end:
   %phi = phi <16 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
   ret <16 x bfloat> %phi
 }
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll
index 6656733d53e51..ead5d76b2e572 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll
@@ -6,7 +6,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
-define <9 x float> @bitcast_v9i32_to_v9f32(<9 x i32> %a, i32 %b) {
+define <9 x float> @bitcast_v9i32_to_v9f32(<9 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v9i32_to_v9f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -109,7 +109,7 @@ end:
   ret <9 x float> %phi
 }
 
-define inreg <9 x float> @bitcast_v9i32_to_v9f32_scalar(<9 x i32> inreg %a, i32 inreg %b) {
+define inreg <9 x float> @bitcast_v9i32_to_v9f32_scalar(<9 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v9i32_to_v9f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -249,7 +249,7 @@ end:
   ret <9 x float> %phi
 }
 
-define <9 x i32> @bitcast_v9f32_to_v9i32(<9 x float> %a, i32 %b) {
+define <9 x i32> @bitcast_v9f32_to_v9i32(<9 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v9f32_to_v9i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -347,7 +347,7 @@ end:
   ret <9 x i32> %phi
 }
 
-define inreg <9 x i32> @bitcast_v9f32_to_v9i32_scalar(<9 x float> inreg %a, i32 inreg %b) {
+define inreg <9 x i32> @bitcast_v9f32_to_v9i32_scalar(<9 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v9f32_to_v9i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -494,7 +494,7 @@ end:
   ret <9 x i32> %phi
 }
 
-define <18 x i16> @bitcast_v9i32_to_v18i16(<9 x i32> %a, i32 %b) {
+define <18 x i16> @bitcast_v9i32_to_v18i16(<9 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v9i32_to_v18i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -654,7 +654,7 @@ end:
   ret <18 x i16> %phi
 }
 
-define inreg <18 x i16> @bitcast_v9i32_to_v18i16_scalar(<9 x i32> inreg %a, i32 inreg %b) {
+define inreg <18 x i16> @bitcast_v9i32_to_v18i16_scalar(<9 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v9i32_to_v18i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -848,7 +848,7 @@ end:
   ret <18 x i16> %phi
 }
 
-define <9 x i32> @bitcast_v18i16_to_v9i32(<18 x i16> %a, i32 %b) {
+define <9 x i32> @bitcast_v18i16_to_v9i32(<18 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v18i16_to_v9i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1071,7 +1071,7 @@ end:
   ret <9 x i32> %phi
 }
 
-define inreg <9 x i32> @bitcast_v18i16_to_v9i32_scalar(<18 x i16> inreg %a, i32 inreg %b) {
+define inreg <9 x i32> @bitcast_v18i16_to_v9i32_scalar(<18 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v18i16_to_v9i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1325,7 +1325,7 @@ end:
   ret <9 x i32> %phi
 }
 
-define <18 x half> @bitcast_v9i32_to_v18f16(<9 x i32> %a, i32 %b) {
+define <18 x half> @bitcast_v9i32_to_v18f16(<9 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v9i32_to_v18f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1485,7 +1485,7 @@ end:
   ret <18 x half> %phi
 }
 
-define inreg <18 x half> @bitcast_v9i32_to_v18f16_scalar(<9 x i32> inreg %a, i32 inreg %b) {
+define inreg <18 x half> @bitcast_v9i32_to_v18f16_scalar(<9 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v9i32_to_v18f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1679,7 +1679,7 @@ end:
   ret <18 x half> %phi
 }
 
-define <9 x i32> @bitcast_v18f16_to_v9i32(<18 x half> %a, i32 %b) {
+define <9 x i32> @bitcast_v18f16_to_v9i32(<18 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v18f16_to_v9i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1939,7 +1939,7 @@ end:
   ret <9 x i32> %phi
 }
 
-define inreg <9 x i32> @bitcast_v18f16_to_v9i32_scalar(<18 x half> inreg %a, i32 inreg %b) {
+define inreg <9 x i32> @bitcast_v18f16_to_v9i32_scalar(<18 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v18f16_to_v9i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2224,7 +2224,7 @@ end:
   ret <9 x i32> %phi
 }
 
-define <18 x i16> @bitcast_v9f32_to_v18i16(<9 x float> %a, i32 %b) {
+define <18 x i16> @bitcast_v9f32_to_v18i16(<9 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v9f32_to_v18i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2379,7 +2379,7 @@ end:
   ret <18 x i16> %phi
 }
 
-define inreg <18 x i16> @bitcast_v9f32_to_v18i16_scalar(<9 x float> inreg %a, i32 inreg %b) {
+define inreg <18 x i16> @bitcast_v9f32_to_v18i16_scalar(<9 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v9f32_to_v18i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2607,7 +2607,7 @@ end:
   ret <18 x i16> %phi
 }
 
-define <9 x float> @bitcast_v18i16_to_v9f32(<18 x i16> %a, i32 %b) {
+define <9 x float> @bitcast_v18i16_to_v9f32(<18 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v18i16_to_v9f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2830,7 +2830,7 @@ end:
   ret <9 x float> %phi
 }
 
-define inreg <9 x float> @bitcast_v18i16_to_v9f32_scalar(<18 x i16> inreg %a, i32 inreg %b) {
+define inreg <9 x float> @bitcast_v18i16_to_v9f32_scalar(<18 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v18i16_to_v9f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3084,7 +3084,7 @@ end:
   ret <9 x float> %phi
 }
 
-define <18 x half> @bitcast_v9f32_to_v18f16(<9 x float> %a, i32 %b) {
+define <18 x half> @bitcast_v9f32_to_v18f16(<9 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v9f32_to_v18f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3239,7 +3239,7 @@ end:
   ret <18 x half> %phi
 }
 
-define inreg <18 x half> @bitcast_v9f32_to_v18f16_scalar(<9 x float> inreg %a, i32 inreg %b) {
+define inreg <18 x half> @bitcast_v9f32_to_v18f16_scalar(<9 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v9f32_to_v18f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3467,7 +3467,7 @@ end:
   ret <18 x half> %phi
 }
 
-define <9 x float> @bitcast_v18f16_to_v9f32(<18 x half> %a, i32 %b) {
+define <9 x float> @bitcast_v18f16_to_v9f32(<18 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v18f16_to_v9f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3727,7 +3727,7 @@ end:
   ret <9 x float> %phi
 }
 
-define inreg <9 x float> @bitcast_v18f16_to_v9f32_scalar(<18 x half> inreg %a, i32 inreg %b) {
+define inreg <9 x float> @bitcast_v18f16_to_v9f32_scalar(<18 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v18f16_to_v9f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4012,7 +4012,7 @@ end:
   ret <9 x float> %phi
 }
 
-define <18 x half> @bitcast_v18i16_to_v18f16(<18 x i16> %a, i32 %b) {
+define <18 x half> @bitcast_v18i16_to_v18f16(<18 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v18i16_to_v18f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4274,7 +4274,7 @@ end:
   ret <18 x half> %phi
 }
 
-define inreg <18 x half> @bitcast_v18i16_to_v18f16_scalar(<18 x i16> inreg %a, i32 inreg %b) {
+define inreg <18 x half> @bitcast_v18i16_to_v18f16_scalar(<18 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v18i16_to_v18f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4590,7 +4590,7 @@ end:
   ret <18 x half> %phi
 }
 
-define <18 x i16> @bitcast_v18f16_to_v18i16(<18 x half> %a, i32 %b) {
+define <18 x i16> @bitcast_v18f16_to_v18i16(<18 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v18f16_to_v18i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4819,7 +4819,7 @@ end:
   ret <18 x i16> %phi
 }
 
-define inreg <18 x i16> @bitcast_v18f16_to_v18i16_scalar(<18 x half> inreg %a, i32 inreg %b) {
+define inreg <18 x i16> @bitcast_v18f16_to_v18i16_scalar(<18 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v18f16_to_v18i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5133,6 +5133,9 @@ end:
   %phi = phi <18 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
   ret <18 x i16> %phi
 }
+
+attributes #0 = { nounwind }
+
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; GFX11-FAKE16: {{.*}}
 ; GFX11-TRUE16: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
index 9ae6700ac1825..442767fc1162d 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
@@ -6,7 +6,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
-define <10 x float> @bitcast_v10i32_to_v10f32(<10 x i32> %a, i32 %b) {
+define <10 x float> @bitcast_v10i32_to_v10f32(<10 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v10i32_to_v10f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -116,7 +116,7 @@ end:
   ret <10 x float> %phi
 }
 
-define inreg <10 x float> @bitcast_v10i32_to_v10f32_scalar(<10 x i32> inreg %a, i32 inreg %b) {
+define inreg <10 x float> @bitcast_v10i32_to_v10f32_scalar(<10 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v10i32_to_v10f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -263,7 +263,7 @@ end:
   ret <10 x float> %phi
 }
 
-define <10 x i32> @bitcast_v10f32_to_v10i32(<10 x float> %a, i32 %b) {
+define <10 x i32> @bitcast_v10f32_to_v10i32(<10 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v10f32_to_v10i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -367,7 +367,7 @@ end:
   ret <10 x i32> %phi
 }
 
-define inreg <10 x i32> @bitcast_v10f32_to_v10i32_scalar(<10 x float> inreg %a, i32 inreg %b) {
+define inreg <10 x i32> @bitcast_v10f32_to_v10i32_scalar(<10 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v10f32_to_v10i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -521,7 +521,7 @@ end:
   ret <10 x i32> %phi
 }
 
-define <20 x i16> @bitcast_v10i32_to_v20i16(<10 x i32> %a, i32 %b) {
+define <20 x i16> @bitcast_v10i32_to_v20i16(<10 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v10i32_to_v20i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -694,7 +694,7 @@ end:
   ret <20 x i16> %phi
 }
 
-define inreg <20 x i16> @bitcast_v10i32_to_v20i16_scalar(<10 x i32> inreg %a, i32 inreg %b) {
+define inreg <20 x i16> @bitcast_v10i32_to_v20i16_scalar(<10 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v10i32_to_v20i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -901,7 +901,7 @@ end:
   ret <20 x i16> %phi
 }
 
-define <10 x i32> @bitcast_v20i16_to_v10i32(<20 x i16> %a, i32 %b) {
+define <10 x i32> @bitcast_v20i16_to_v10i32(<20 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v20i16_to_v10i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1141,7 +1141,7 @@ end:
   ret <10 x i32> %phi
 }
 
-define inreg <10 x i32> @bitcast_v20i16_to_v10i32_scalar(<20 x i16> inreg %a, i32 inreg %b) {
+define inreg <10 x i32> @bitcast_v20i16_to_v10i32_scalar(<20 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v20i16_to_v10i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1414,7 +1414,7 @@ end:
   ret <10 x i32> %phi
 }
 
-define <20 x half> @bitcast_v10i32_to_v20f16(<10 x i32> %a, i32 %b) {
+define <20 x half> @bitcast_v10i32_to_v20f16(<10 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v10i32_to_v20f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1587,7 +1587,7 @@ end:
   ret <20 x half> %phi
 }
 
-define inreg <20 x half> @bitcast_v10i32_to_v20f16_scalar(<10 x i32> inreg %a, i32 inreg %b) {
+define inreg <20 x half> @bitcast_v10i32_to_v20f16_scalar(<10 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v10i32_to_v20f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1794,7 +1794,7 @@ end:
   ret <20 x half> %phi
 }
 
-define <10 x i32> @bitcast_v20f16_to_v10i32(<20 x half> %a, i32 %b) {
+define <10 x i32> @bitcast_v20f16_to_v10i32(<20 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v20f16_to_v10i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2074,7 +2074,7 @@ end:
   ret <10 x i32> %phi
 }
 
-define inreg <10 x i32> @bitcast_v20f16_to_v10i32_scalar(<20 x half> inreg %a, i32 inreg %b) {
+define inreg <10 x i32> @bitcast_v20f16_to_v10i32_scalar(<20 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v20f16_to_v10i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2381,7 +2381,7 @@ end:
   ret <10 x i32> %phi
 }
 
-define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) {
+define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v10i32_to_v40i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3251,7 +3251,7 @@ end:
   ret <40 x i8> %phi
 }
 
-define inreg <40 x i8> @bitcast_v10i32_to_v40i8_scalar(<10 x i32> inreg %a, i32 inreg %b) {
+define inreg <40 x i8> @bitcast_v10i32_to_v40i8_scalar(<10 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v10i32_to_v40i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4049,7 +4049,7 @@ end:
   ret <40 x i8> %phi
 }
 
-define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) {
+define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v40i8_to_v10i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5247,7 +5247,7 @@ end:
   ret <10 x i32> %phi
 }
 
-define inreg <10 x i32> @bitcast_v40i8_to_v10i32_scalar(<40 x i8> inreg %a, i32 inreg %b) {
+define inreg <10 x i32> @bitcast_v40i8_to_v10i32_scalar(<40 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v40i8_to_v10i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6104,7 +6104,7 @@ end:
   ret <10 x i32> %phi
 }
 
-define <5 x double> @bitcast_v10i32_to_v5f64(<10 x i32> %a, i32 %b) {
+define <5 x double> @bitcast_v10i32_to_v5f64(<10 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v10i32_to_v5f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6214,7 +6214,7 @@ end:
   ret <5 x double> %phi
 }
 
-define inreg <5 x double> @bitcast_v10i32_to_v5f64_scalar(<10 x i32> inreg %a, i32 inreg %b) {
+define inreg <5 x double> @bitcast_v10i32_to_v5f64_scalar(<10 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v10i32_to_v5f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6361,7 +6361,7 @@ end:
   ret <5 x double> %phi
 }
 
-define <10 x i32> @bitcast_v5f64_to_v10i32(<5 x double> %a, i32 %b) {
+define <10 x i32> @bitcast_v5f64_to_v10i32(<5 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v5f64_to_v10i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6451,7 +6451,7 @@ end:
   ret <10 x i32> %phi
 }
 
-define inreg <10 x i32> @bitcast_v5f64_to_v10i32_scalar(<5 x double> inreg %a, i32 inreg %b) {
+define inreg <10 x i32> @bitcast_v5f64_to_v10i32_scalar(<5 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v5f64_to_v10i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6585,7 +6585,7 @@ end:
   ret <10 x i32> %phi
 }
 
-define <5 x i64> @bitcast_v10i32_to_v5i64(<10 x i32> %a, i32 %b) {
+define <5 x i64> @bitcast_v10i32_to_v5i64(<10 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v10i32_to_v5i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6695,7 +6695,7 @@ end:
   ret <5 x i64> %phi
 }
 
-define inreg <5 x i64> @bitcast_v10i32_to_v5i64_scalar(<10 x i32> inreg %a, i32 inreg %b) {
+define inreg <5 x i64> @bitcast_v10i32_to_v5i64_scalar(<10 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v10i32_to_v5i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6842,7 +6842,7 @@ end:
   ret <5 x i64> %phi
 }
 
-define <10 x i32> @bitcast_v5i64_to_v10i32(<5 x i64> %a, i32 %b) {
+define <10 x i32> @bitcast_v5i64_to_v10i32(<5 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v5i64_to_v10i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6955,7 +6955,7 @@ end:
   ret <10 x i32> %phi
 }
 
-define inreg <10 x i32> @bitcast_v5i64_to_v10i32_scalar(<5 x i64> inreg %a, i32 inreg %b) {
+define inreg <10 x i32> @bitcast_v5i64_to_v10i32_scalar(<5 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v5i64_to_v10i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7102,7 +7102,7 @@ end:
   ret <10 x i32> %phi
 }
 
-define <20 x i16> @bitcast_v10f32_to_v20i16(<10 x float> %a, i32 %b) {
+define <20 x i16> @bitcast_v10f32_to_v20i16(<10 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v10f32_to_v20i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7269,7 +7269,7 @@ end:
   ret <20 x i16> %phi
 }
 
-define inreg <20 x i16> @bitcast_v10f32_to_v20i16_scalar(<10 x float> inreg %a, i32 inreg %b) {
+define inreg <20 x i16> @bitcast_v10f32_to_v20i16_scalar(<10 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v10f32_to_v20i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7509,7 +7509,7 @@ end:
   ret <20 x i16> %phi
 }
 
-define <10 x float> @bitcast_v20i16_to_v10f32(<20 x i16> %a, i32 %b) {
+define <10 x float> @bitcast_v20i16_to_v10f32(<20 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v20i16_to_v10f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7749,7 +7749,7 @@ end:
   ret <10 x float> %phi
 }
 
-define inreg <10 x float> @bitcast_v20i16_to_v10f32_scalar(<20 x i16> inreg %a, i32 inreg %b) {
+define inreg <10 x float> @bitcast_v20i16_to_v10f32_scalar(<20 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v20i16_to_v10f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8022,7 +8022,7 @@ end:
   ret <10 x float> %phi
 }
 
-define <20 x half> @bitcast_v10f32_to_v20f16(<10 x float> %a, i32 %b) {
+define <20 x half> @bitcast_v10f32_to_v20f16(<10 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v10f32_to_v20f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8189,7 +8189,7 @@ end:
   ret <20 x half> %phi
 }
 
-define inreg <20 x half> @bitcast_v10f32_to_v20f16_scalar(<10 x float> inreg %a, i32 inreg %b) {
+define inreg <20 x half> @bitcast_v10f32_to_v20f16_scalar(<10 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v10f32_to_v20f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8429,7 +8429,7 @@ end:
   ret <20 x half> %phi
 }
 
-define <10 x float> @bitcast_v20f16_to_v10f32(<20 x half> %a, i32 %b) {
+define <10 x float> @bitcast_v20f16_to_v10f32(<20 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v20f16_to_v10f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8709,7 +8709,7 @@ end:
   ret <10 x float> %phi
 }
 
-define inreg <10 x float> @bitcast_v20f16_to_v10f32_scalar(<20 x half> inreg %a, i32 inreg %b) {
+define inreg <10 x float> @bitcast_v20f16_to_v10f32_scalar(<20 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v20f16_to_v10f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9016,7 +9016,7 @@ end:
   ret <10 x float> %phi
 }
 
-define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) {
+define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v10f32_to_v40i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9878,7 +9878,7 @@ end:
   ret <40 x i8> %phi
 }
 
-define inreg <40 x i8> @bitcast_v10f32_to_v40i8_scalar(<10 x float> inreg %a, i32 inreg %b) {
+define inreg <40 x i8> @bitcast_v10f32_to_v40i8_scalar(<10 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v10f32_to_v40i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10769,7 +10769,7 @@ end:
   ret <40 x i8> %phi
 }
 
-define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) {
+define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v40i8_to_v10f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11967,7 +11967,7 @@ end:
   ret <10 x float> %phi
 }
 
-define inreg <10 x float> @bitcast_v40i8_to_v10f32_scalar(<40 x i8> inreg %a, i32 inreg %b) {
+define inreg <10 x float> @bitcast_v40i8_to_v10f32_scalar(<40 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v40i8_to_v10f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12824,7 +12824,7 @@ end:
   ret <10 x float> %phi
 }
 
-define <5 x double> @bitcast_v10f32_to_v5f64(<10 x float> %a, i32 %b) {
+define <5 x double> @bitcast_v10f32_to_v5f64(<10 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v10f32_to_v5f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12928,7 +12928,7 @@ end:
   ret <5 x double> %phi
 }
 
-define inreg <5 x double> @bitcast_v10f32_to_v5f64_scalar(<10 x float> inreg %a, i32 inreg %b) {
+define inreg <5 x double> @bitcast_v10f32_to_v5f64_scalar(<10 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v10f32_to_v5f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13103,7 +13103,7 @@ end:
   ret <5 x double> %phi
 }
 
-define <10 x float> @bitcast_v5f64_to_v10f32(<5 x double> %a, i32 %b) {
+define <10 x float> @bitcast_v5f64_to_v10f32(<5 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v5f64_to_v10f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13193,7 +13193,7 @@ end:
   ret <10 x float> %phi
 }
 
-define inreg <10 x float> @bitcast_v5f64_to_v10f32_scalar(<5 x double> inreg %a, i32 inreg %b) {
+define inreg <10 x float> @bitcast_v5f64_to_v10f32_scalar(<5 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v5f64_to_v10f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13327,7 +13327,7 @@ end:
   ret <10 x float> %phi
 }
 
-define <5 x i64> @bitcast_v10f32_to_v5i64(<10 x float> %a, i32 %b) {
+define <5 x i64> @bitcast_v10f32_to_v5i64(<10 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v10f32_to_v5i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13431,7 +13431,7 @@ end:
   ret <5 x i64> %phi
 }
 
-define inreg <5 x i64> @bitcast_v10f32_to_v5i64_scalar(<10 x float> inreg %a, i32 inreg %b) {
+define inreg <5 x i64> @bitcast_v10f32_to_v5i64_scalar(<10 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v10f32_to_v5i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13606,7 +13606,7 @@ end:
   ret <5 x i64> %phi
 }
 
-define <10 x float> @bitcast_v5i64_to_v10f32(<5 x i64> %a, i32 %b) {
+define <10 x float> @bitcast_v5i64_to_v10f32(<5 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v5i64_to_v10f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13719,7 +13719,7 @@ end:
   ret <10 x float> %phi
 }
 
-define inreg <10 x float> @bitcast_v5i64_to_v10f32_scalar(<5 x i64> inreg %a, i32 inreg %b) {
+define inreg <10 x float> @bitcast_v5i64_to_v10f32_scalar(<5 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v5i64_to_v10f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13866,7 +13866,7 @@ end:
   ret <10 x float> %phi
 }
 
-define <20 x half> @bitcast_v20i16_to_v20f16(<20 x i16> %a, i32 %b) {
+define <20 x half> @bitcast_v20i16_to_v20f16(<20 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v20i16_to_v20f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14151,7 +14151,7 @@ end:
   ret <20 x half> %phi
 }
 
-define inreg <20 x half> @bitcast_v20i16_to_v20f16_scalar(<20 x i16> inreg %a, i32 inreg %b) {
+define inreg <20 x half> @bitcast_v20i16_to_v20f16_scalar(<20 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v20i16_to_v20f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14492,7 +14492,7 @@ end:
   ret <20 x half> %phi
 }
 
-define <20 x i16> @bitcast_v20f16_to_v20i16(<20 x half> %a, i32 %b) {
+define <20 x i16> @bitcast_v20f16_to_v20i16(<20 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v20f16_to_v20i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14739,7 +14739,7 @@ end:
   ret <20 x i16> %phi
 }
 
-define inreg <20 x i16> @bitcast_v20f16_to_v20i16_scalar(<20 x half> inreg %a, i32 inreg %b) {
+define inreg <20 x i16> @bitcast_v20f16_to_v20i16_scalar(<20 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v20f16_to_v20i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15076,7 +15076,7 @@ end:
   ret <20 x i16> %phi
 }
 
-define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) {
+define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v20i16_to_v40i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16111,7 +16111,7 @@ end:
   ret <40 x i8> %phi
 }
 
-define inreg <40 x i8> @bitcast_v20i16_to_v40i8_scalar(<20 x i16> inreg %a, i32 inreg %b) {
+define inreg <40 x i8> @bitcast_v20i16_to_v40i8_scalar(<20 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v20i16_to_v40i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16119,10 +16119,10 @@ define inreg <40 x i8> @bitcast_v20i16_to_v40i8_scalar(<20 x i16> inreg %a, i32
 ; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v3, s30, 0
-; SI-NEXT:    v_writelane_b32 v3, s31, 1
-; SI-NEXT:    v_writelane_b32 v3, s34, 2
-; SI-NEXT:    v_writelane_b32 v3, s35, 3
+; SI-NEXT:    v_writelane_b32 v3, s34, 0
+; SI-NEXT:    v_writelane_b32 v3, s35, 1
+; SI-NEXT:    v_writelane_b32 v3, s30, 2
+; SI-NEXT:    v_writelane_b32 v3, s31, 3
 ; SI-NEXT:    s_lshr_b32 s90, s25, 16
 ; SI-NEXT:    s_lshr_b32 s35, s24, 16
 ; SI-NEXT:    s_lshr_b32 s91, s23, 16
@@ -16400,11 +16400,11 @@ define inreg <40 x i8> @bitcast_v20i16_to_v40i8_scalar(<20 x i16> inreg %a, i32
 ; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 36, v0
 ; SI-NEXT:    v_mov_b32_e32 v1, s4
+; SI-NEXT:    v_readlane_b32 s30, v3, 2
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT:    v_readlane_b32 s35, v3, 3
-; SI-NEXT:    v_readlane_b32 s34, v3, 2
-; SI-NEXT:    v_readlane_b32 s31, v3, 1
-; SI-NEXT:    v_readlane_b32 s30, v3, 0
+; SI-NEXT:    v_readlane_b32 s31, v3, 3
+; SI-NEXT:    v_readlane_b32 s35, v3, 1
+; SI-NEXT:    v_readlane_b32 s34, v3, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -17082,7 +17082,7 @@ end:
   ret <40 x i8> %phi
 }
 
-define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
+define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v40i8_to_v20i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18418,7 +18418,7 @@ end:
   ret <20 x i16> %phi
 }
 
-define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32 inreg %b) {
+define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v40i8_to_v20i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18426,14 +18426,15 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32
 ; SI-NEXT:    buffer_store_dword v27, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v27, s30, 0
-; SI-NEXT:    v_writelane_b32 v27, s31, 1
-; SI-NEXT:    v_writelane_b32 v27, s34, 2
-; SI-NEXT:    v_writelane_b32 v27, s35, 3
-; SI-NEXT:    v_writelane_b32 v27, s36, 4
-; SI-NEXT:    v_writelane_b32 v27, s37, 5
+; SI-NEXT:    v_writelane_b32 v27, s34, 0
+; SI-NEXT:    v_writelane_b32 v27, s35, 1
+; SI-NEXT:    v_writelane_b32 v27, s36, 2
+; SI-NEXT:    v_writelane_b32 v27, s37, 3
+; SI-NEXT:    v_writelane_b32 v27, s38, 4
+; SI-NEXT:    v_writelane_b32 v27, s39, 5
+; SI-NEXT:    v_writelane_b32 v27, s30, 6
 ; SI-NEXT:    v_readfirstlane_b32 s4, v26
-; SI-NEXT:    v_writelane_b32 v27, s38, 6
+; SI-NEXT:    v_writelane_b32 v27, s31, 7
 ; SI-NEXT:    v_readfirstlane_b32 s90, v25
 ; SI-NEXT:    v_readfirstlane_b32 s91, v24
 ; SI-NEXT:    v_readfirstlane_b32 s93, v23
@@ -18461,7 +18462,6 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32
 ; SI-NEXT:    v_readfirstlane_b32 s58, v1
 ; SI-NEXT:    s_cmp_lg_u32 s4, 0
 ; SI-NEXT:    v_readfirstlane_b32 s59, v0
-; SI-NEXT:    v_writelane_b32 v27, s39, 7
 ; SI-NEXT:    s_cbranch_scc0 .LBB51_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_and_b32 s4, s16, 0xff
@@ -18742,6 +18742,7 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32
 ; SI-NEXT:    s_and_b32 s5, s5, 0xffff
 ; SI-NEXT:    s_lshl_b32 s14, s15, 16
 ; SI-NEXT:    s_or_b32 s5, s5, s14
+; SI-NEXT:    v_readlane_b32 s30, v27, 6
 ; SI-NEXT:    v_mov_b32_e32 v0, s12
 ; SI-NEXT:    v_mov_b32_e32 v1, s13
 ; SI-NEXT:    v_mov_b32_e32 v2, s10
@@ -18752,14 +18753,13 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32
 ; SI-NEXT:    v_mov_b32_e32 v7, s7
 ; SI-NEXT:    v_mov_b32_e32 v8, s4
 ; SI-NEXT:    v_mov_b32_e32 v9, s5
-; SI-NEXT:    v_readlane_b32 s39, v27, 7
-; SI-NEXT:    v_readlane_b32 s38, v27, 6
-; SI-NEXT:    v_readlane_b32 s37, v27, 5
-; SI-NEXT:    v_readlane_b32 s36, v27, 4
-; SI-NEXT:    v_readlane_b32 s35, v27, 3
-; SI-NEXT:    v_readlane_b32 s34, v27, 2
-; SI-NEXT:    v_readlane_b32 s31, v27, 1
-; SI-NEXT:    v_readlane_b32 s30, v27, 0
+; SI-NEXT:    v_readlane_b32 s31, v27, 7
+; SI-NEXT:    v_readlane_b32 s39, v27, 5
+; SI-NEXT:    v_readlane_b32 s38, v27, 4
+; SI-NEXT:    v_readlane_b32 s37, v27, 3
+; SI-NEXT:    v_readlane_b32 s36, v27, 2
+; SI-NEXT:    v_readlane_b32 s35, v27, 1
+; SI-NEXT:    v_readlane_b32 s34, v27, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -19524,7 +19524,7 @@ end:
   ret <20 x i16> %phi
 }
 
-define <5 x double> @bitcast_v20i16_to_v5f64(<20 x i16> %a, i32 %b) {
+define <5 x double> @bitcast_v20i16_to_v5f64(<20 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v20i16_to_v5f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19764,7 +19764,7 @@ end:
   ret <5 x double> %phi
 }
 
-define inreg <5 x double> @bitcast_v20i16_to_v5f64_scalar(<20 x i16> inreg %a, i32 inreg %b) {
+define inreg <5 x double> @bitcast_v20i16_to_v5f64_scalar(<20 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v20i16_to_v5f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20070,7 +20070,7 @@ end:
   ret <5 x double> %phi
 }
 
-define <20 x i16> @bitcast_v5f64_to_v20i16(<5 x double> %a, i32 %b) {
+define <20 x i16> @bitcast_v5f64_to_v20i16(<5 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v5f64_to_v20i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20223,7 +20223,7 @@ end:
   ret <20 x i16> %phi
 }
 
-define inreg <20 x i16> @bitcast_v5f64_to_v20i16_scalar(<5 x double> inreg %a, i32 inreg %b) {
+define inreg <20 x i16> @bitcast_v5f64_to_v20i16_scalar(<5 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v5f64_to_v20i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20443,7 +20443,7 @@ end:
   ret <20 x i16> %phi
 }
 
-define <5 x i64> @bitcast_v20i16_to_v5i64(<20 x i16> %a, i32 %b) {
+define <5 x i64> @bitcast_v20i16_to_v5i64(<20 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v20i16_to_v5i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20683,7 +20683,7 @@ end:
   ret <5 x i64> %phi
 }
 
-define inreg <5 x i64> @bitcast_v20i16_to_v5i64_scalar(<20 x i16> inreg %a, i32 inreg %b) {
+define inreg <5 x i64> @bitcast_v20i16_to_v5i64_scalar(<20 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v20i16_to_v5i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20989,7 +20989,7 @@ end:
   ret <5 x i64> %phi
 }
 
-define <20 x i16> @bitcast_v5i64_to_v20i16(<5 x i64> %a, i32 %b) {
+define <20 x i16> @bitcast_v5i64_to_v20i16(<5 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v5i64_to_v20i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -21165,7 +21165,7 @@ end:
   ret <20 x i16> %phi
 }
 
-define inreg <20 x i16> @bitcast_v5i64_to_v20i16_scalar(<5 x i64> inreg %a, i32 inreg %b) {
+define inreg <20 x i16> @bitcast_v5i64_to_v20i16_scalar(<5 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v5i64_to_v20i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -21372,7 +21372,7 @@ end:
   ret <20 x i16> %phi
 }
 
-define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) {
+define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v20f16_to_v40i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22391,7 +22391,7 @@ end:
   ret <40 x i8> %phi
 }
 
-define inreg <40 x i8> @bitcast_v20f16_to_v40i8_scalar(<20 x half> inreg %a, i32 inreg %b) {
+define inreg <40 x i8> @bitcast_v20f16_to_v40i8_scalar(<20 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v20f16_to_v40i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22399,10 +22399,10 @@ define inreg <40 x i8> @bitcast_v20f16_to_v40i8_scalar(<20 x half> inreg %a, i32
 ; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v12, s30, 0
-; SI-NEXT:    v_writelane_b32 v12, s31, 1
-; SI-NEXT:    v_writelane_b32 v12, s34, 2
-; SI-NEXT:    v_writelane_b32 v12, s35, 3
+; SI-NEXT:    v_writelane_b32 v12, s34, 0
+; SI-NEXT:    v_writelane_b32 v12, s35, 1
+; SI-NEXT:    v_writelane_b32 v12, s30, 2
+; SI-NEXT:    v_writelane_b32 v12, s31, 3
 ; SI-NEXT:    s_lshr_b32 s34, s25, 16
 ; SI-NEXT:    s_lshr_b32 s35, s24, 16
 ; SI-NEXT:    s_lshr_b32 s30, s23, 16
@@ -22760,11 +22760,11 @@ define inreg <40 x i8> @bitcast_v20f16_to_v40i8_scalar(<20 x half> inreg %a, i32
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    v_or_b32_e32 v1, s4, v1
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 36, v0
+; SI-NEXT:    v_readlane_b32 s30, v12, 2
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT:    v_readlane_b32 s35, v12, 3
-; SI-NEXT:    v_readlane_b32 s34, v12, 2
-; SI-NEXT:    v_readlane_b32 s31, v12, 1
-; SI-NEXT:    v_readlane_b32 s30, v12, 0
+; SI-NEXT:    v_readlane_b32 s31, v12, 3
+; SI-NEXT:    v_readlane_b32 s35, v12, 1
+; SI-NEXT:    v_readlane_b32 s34, v12, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -23424,7 +23424,7 @@ end:
   ret <40 x i8> %phi
 }
 
-define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
+define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v40i8_to_v20f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -24760,7 +24760,7 @@ end:
   ret <20 x half> %phi
 }
 
-define inreg <20 x half> @bitcast_v40i8_to_v20f16_scalar(<40 x i8> inreg %a, i32 inreg %b) {
+define inreg <20 x half> @bitcast_v40i8_to_v20f16_scalar(<40 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v40i8_to_v20f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -24768,14 +24768,15 @@ define inreg <20 x half> @bitcast_v40i8_to_v20f16_scalar(<40 x i8> inreg %a, i32
 ; SI-NEXT:    buffer_store_dword v27, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v27, s30, 0
-; SI-NEXT:    v_writelane_b32 v27, s31, 1
-; SI-NEXT:    v_writelane_b32 v27, s34, 2
-; SI-NEXT:    v_writelane_b32 v27, s35, 3
-; SI-NEXT:    v_writelane_b32 v27, s36, 4
-; SI-NEXT:    v_writelane_b32 v27, s37, 5
+; SI-NEXT:    v_writelane_b32 v27, s34, 0
+; SI-NEXT:    v_writelane_b32 v27, s35, 1
+; SI-NEXT:    v_writelane_b32 v27, s36, 2
+; SI-NEXT:    v_writelane_b32 v27, s37, 3
+; SI-NEXT:    v_writelane_b32 v27, s38, 4
+; SI-NEXT:    v_writelane_b32 v27, s39, 5
+; SI-NEXT:    v_writelane_b32 v27, s30, 6
 ; SI-NEXT:    v_readfirstlane_b32 s4, v26
-; SI-NEXT:    v_writelane_b32 v27, s38, 6
+; SI-NEXT:    v_writelane_b32 v27, s31, 7
 ; SI-NEXT:    v_readfirstlane_b32 s90, v25
 ; SI-NEXT:    v_readfirstlane_b32 s91, v24
 ; SI-NEXT:    v_readfirstlane_b32 s93, v23
@@ -24803,7 +24804,6 @@ define inreg <20 x half> @bitcast_v40i8_to_v20f16_scalar(<40 x i8> inreg %a, i32
 ; SI-NEXT:    v_readfirstlane_b32 s58, v1
 ; SI-NEXT:    s_cmp_lg_u32 s4, 0
 ; SI-NEXT:    v_readfirstlane_b32 s59, v0
-; SI-NEXT:    v_writelane_b32 v27, s39, 7
 ; SI-NEXT:    s_cbranch_scc0 .LBB63_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_and_b32 s4, s16, 0xff
@@ -25084,6 +25084,7 @@ define inreg <20 x half> @bitcast_v40i8_to_v20f16_scalar(<40 x i8> inreg %a, i32
 ; SI-NEXT:    s_and_b32 s5, s5, 0xffff
 ; SI-NEXT:    s_lshl_b32 s14, s15, 16
 ; SI-NEXT:    s_or_b32 s5, s5, s14
+; SI-NEXT:    v_readlane_b32 s30, v27, 6
 ; SI-NEXT:    v_mov_b32_e32 v0, s12
 ; SI-NEXT:    v_mov_b32_e32 v1, s13
 ; SI-NEXT:    v_mov_b32_e32 v2, s10
@@ -25094,14 +25095,13 @@ define inreg <20 x half> @bitcast_v40i8_to_v20f16_scalar(<40 x i8> inreg %a, i32
 ; SI-NEXT:    v_mov_b32_e32 v7, s7
 ; SI-NEXT:    v_mov_b32_e32 v8, s4
 ; SI-NEXT:    v_mov_b32_e32 v9, s5
-; SI-NEXT:    v_readlane_b32 s39, v27, 7
-; SI-NEXT:    v_readlane_b32 s38, v27, 6
-; SI-NEXT:    v_readlane_b32 s37, v27, 5
-; SI-NEXT:    v_readlane_b32 s36, v27, 4
-; SI-NEXT:    v_readlane_b32 s35, v27, 3
-; SI-NEXT:    v_readlane_b32 s34, v27, 2
-; SI-NEXT:    v_readlane_b32 s31, v27, 1
-; SI-NEXT:    v_readlane_b32 s30, v27, 0
+; SI-NEXT:    v_readlane_b32 s31, v27, 7
+; SI-NEXT:    v_readlane_b32 s39, v27, 5
+; SI-NEXT:    v_readlane_b32 s38, v27, 4
+; SI-NEXT:    v_readlane_b32 s37, v27, 3
+; SI-NEXT:    v_readlane_b32 s36, v27, 2
+; SI-NEXT:    v_readlane_b32 s35, v27, 1
+; SI-NEXT:    v_readlane_b32 s34, v27, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -25866,7 +25866,7 @@ end:
   ret <20 x half> %phi
 }
 
-define <5 x double> @bitcast_v20f16_to_v5f64(<20 x half> %a, i32 %b) {
+define <5 x double> @bitcast_v20f16_to_v5f64(<20 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v20f16_to_v5f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -26146,7 +26146,7 @@ end:
   ret <5 x double> %phi
 }
 
-define inreg <5 x double> @bitcast_v20f16_to_v5f64_scalar(<20 x half> inreg %a, i32 inreg %b) {
+define inreg <5 x double> @bitcast_v20f16_to_v5f64_scalar(<20 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v20f16_to_v5f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -26499,7 +26499,7 @@ end:
   ret <5 x double> %phi
 }
 
-define <20 x half> @bitcast_v5f64_to_v20f16(<5 x double> %a, i32 %b) {
+define <20 x half> @bitcast_v5f64_to_v20f16(<5 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v5f64_to_v20f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -26652,7 +26652,7 @@ end:
   ret <20 x half> %phi
 }
 
-define inreg <20 x half> @bitcast_v5f64_to_v20f16_scalar(<5 x double> inreg %a, i32 inreg %b) {
+define inreg <20 x half> @bitcast_v5f64_to_v20f16_scalar(<5 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v5f64_to_v20f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -26872,7 +26872,7 @@ end:
   ret <20 x half> %phi
 }
 
-define <5 x i64> @bitcast_v20f16_to_v5i64(<20 x half> %a, i32 %b) {
+define <5 x i64> @bitcast_v20f16_to_v5i64(<20 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v20f16_to_v5i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -27152,7 +27152,7 @@ end:
   ret <5 x i64> %phi
 }
 
-define inreg <5 x i64> @bitcast_v20f16_to_v5i64_scalar(<20 x half> inreg %a, i32 inreg %b) {
+define inreg <5 x i64> @bitcast_v20f16_to_v5i64_scalar(<20 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v20f16_to_v5i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -27505,7 +27505,7 @@ end:
   ret <5 x i64> %phi
 }
 
-define <20 x half> @bitcast_v5i64_to_v20f16(<5 x i64> %a, i32 %b) {
+define <20 x half> @bitcast_v5i64_to_v20f16(<5 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v5i64_to_v20f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -27681,7 +27681,7 @@ end:
   ret <20 x half> %phi
 }
 
-define inreg <20 x half> @bitcast_v5i64_to_v20f16_scalar(<5 x i64> inreg %a, i32 inreg %b) {
+define inreg <20 x half> @bitcast_v5i64_to_v20f16_scalar(<5 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v5i64_to_v20f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -27888,7 +27888,7 @@ end:
   ret <20 x half> %phi
 }
 
-define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
+define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v40i8_to_v5f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -29140,7 +29140,7 @@ end:
   ret <5 x double> %phi
 }
 
-define inreg <5 x double> @bitcast_v40i8_to_v5f64_scalar(<40 x i8> inreg %a, i32 inreg %b) {
+define inreg <5 x double> @bitcast_v40i8_to_v5f64_scalar(<40 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v40i8_to_v5f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -30021,7 +30021,7 @@ end:
   ret <5 x double> %phi
 }
 
-define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) {
+define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v5f64_to_v40i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -30866,7 +30866,7 @@ end:
   ret <40 x i8> %phi
 }
 
-define inreg <40 x i8> @bitcast_v5f64_to_v40i8_scalar(<5 x double> inreg %a, i32 inreg %b) {
+define inreg <40 x i8> @bitcast_v5f64_to_v40i8_scalar(<5 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v5f64_to_v40i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -31732,7 +31732,7 @@ end:
   ret <40 x i8> %phi
 }
 
-define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
+define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v40i8_to_v5i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -32984,7 +32984,7 @@ end:
   ret <5 x i64> %phi
 }
 
-define inreg <5 x i64> @bitcast_v40i8_to_v5i64_scalar(<40 x i8> inreg %a, i32 inreg %b) {
+define inreg <5 x i64> @bitcast_v40i8_to_v5i64_scalar(<40 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v40i8_to_v5i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -33865,7 +33865,7 @@ end:
   ret <5 x i64> %phi
 }
 
-define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) {
+define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v5i64_to_v40i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -34741,7 +34741,7 @@ end:
   ret <40 x i8> %phi
 }
 
-define inreg <40 x i8> @bitcast_v5i64_to_v40i8_scalar(<5 x i64> inreg %a, i32 inreg %b) {
+define inreg <40 x i8> @bitcast_v5i64_to_v40i8_scalar(<5 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v5i64_to_v40i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -35539,7 +35539,7 @@ end:
   ret <40 x i8> %phi
 }
 
-define <5 x i64> @bitcast_v5f64_to_v5i64(<5 x double> %a, i32 %b) {
+define <5 x i64> @bitcast_v5f64_to_v5i64(<5 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v5f64_to_v5i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -35629,7 +35629,7 @@ end:
   ret <5 x i64> %phi
 }
 
-define inreg <5 x i64> @bitcast_v5f64_to_v5i64_scalar(<5 x double> inreg %a, i32 inreg %b) {
+define inreg <5 x i64> @bitcast_v5f64_to_v5i64_scalar(<5 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v5f64_to_v5i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -35784,7 +35784,7 @@ end:
   ret <5 x i64> %phi
 }
 
-define <5 x double> @bitcast_v5i64_to_v5f64(<5 x i64> %a, i32 %b) {
+define <5 x double> @bitcast_v5i64_to_v5f64(<5 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v5i64_to_v5f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -35897,7 +35897,7 @@ end:
   ret <5 x double> %phi
 }
 
-define inreg <5 x double> @bitcast_v5i64_to_v5f64_scalar(<5 x i64> inreg %a, i32 inreg %b) {
+define inreg <5 x double> @bitcast_v5i64_to_v5f64_scalar(<5 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v5i64_to_v5f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -36042,3 +36042,5 @@ end:
   %phi = phi <5 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
   ret <5 x double> %phi
 }
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll
index 0a8af1ab3e547..123d1042e27c9 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll
@@ -6,7 +6,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
-define float @bitcast_i32_to_f32(i32 %a, i32 %b) {
+define float @bitcast_i32_to_f32(i32 %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_i32_to_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -76,7 +76,7 @@ end:
   ret float %phi
 }
 
-define inreg float @bitcast_i32_to_f32_scalar(i32 inreg %a, i32 inreg %b) {
+define inreg float @bitcast_i32_to_f32_scalar(i32 inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_i32_to_f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -157,7 +157,7 @@ end:
   ret float %phi
 }
 
-define i32 @bitcast_f32_to_i32(float %a, i32 %b) {
+define i32 @bitcast_f32_to_i32(float %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_f32_to_i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -227,7 +227,7 @@ end:
   ret i32 %phi
 }
 
-define inreg i32 @bitcast_f32_to_i32_scalar(float inreg %a, i32 inreg %b) {
+define inreg i32 @bitcast_f32_to_i32_scalar(float inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_f32_to_i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -310,7 +310,7 @@ end:
   ret i32 %phi
 }
 
-define <2 x i16> @bitcast_i32_to_v2i16(i32 %a, i32 %b) {
+define <2 x i16> @bitcast_i32_to_v2i16(i32 %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_i32_to_v2i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -388,7 +388,7 @@ end:
   ret <2 x i16> %phi
 }
 
-define inreg <2 x i16> @bitcast_i32_to_v2i16_scalar(i32 inreg %a, i32 inreg %b) {
+define inreg <2 x i16> @bitcast_i32_to_v2i16_scalar(i32 inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_i32_to_v2i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -475,7 +475,7 @@ end:
   ret <2 x i16> %phi
 }
 
-define i32 @bitcast_v2i16_to_i32(<2 x i16> %a, i32 %b) {
+define i32 @bitcast_v2i16_to_i32(<2 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2i16_to_i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -567,7 +567,7 @@ end:
   ret i32 %phi
 }
 
-define inreg i32 @bitcast_v2i16_to_i32_scalar(<2 x i16> inreg %a, i32 inreg %b) {
+define inreg i32 @bitcast_v2i16_to_i32_scalar(<2 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2i16_to_i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -661,7 +661,7 @@ end:
   ret i32 %phi
 }
 
-define <2 x half> @bitcast_i32_to_v2f16(i32 %a, i32 %b) {
+define <2 x half> @bitcast_i32_to_v2f16(i32 %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_i32_to_v2f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -739,7 +739,7 @@ end:
   ret <2 x half> %phi
 }
 
-define inreg <2 x half> @bitcast_i32_to_v2f16_scalar(i32 inreg %a, i32 inreg %b) {
+define inreg <2 x half> @bitcast_i32_to_v2f16_scalar(i32 inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_i32_to_v2f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -826,7 +826,7 @@ end:
   ret <2 x half> %phi
 }
 
-define i32 @bitcast_v2f16_to_i32(<2 x half> %a, i32 %b) {
+define i32 @bitcast_v2f16_to_i32(<2 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2f16_to_i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -918,7 +918,7 @@ end:
   ret i32 %phi
 }
 
-define inreg i32 @bitcast_v2f16_to_i32_scalar(<2 x half> inreg %a, i32 inreg %b) {
+define inreg i32 @bitcast_v2f16_to_i32_scalar(<2 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2f16_to_i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1019,7 +1019,7 @@ end:
   ret i32 %phi
 }
 
-define <2 x bfloat> @bitcast_i32_to_v2bf16(i32 %a, i32 %b) {
+define <2 x bfloat> @bitcast_i32_to_v2bf16(i32 %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_i32_to_v2bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1102,7 +1102,7 @@ end:
   ret <2 x bfloat> %phi
 }
 
-define inreg <2 x bfloat> @bitcast_i32_to_v2bf16_scalar(i32 inreg %a, i32 inreg %b) {
+define inreg <2 x bfloat> @bitcast_i32_to_v2bf16_scalar(i32 inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_i32_to_v2bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1192,7 +1192,7 @@ end:
   ret <2 x bfloat> %phi
 }
 
-define i32 @bitcast_v2bf16_to_i32(<2 x bfloat> %a, i32 %b) {
+define i32 @bitcast_v2bf16_to_i32(<2 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2bf16_to_i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1376,7 +1376,7 @@ end:
   ret i32 %phi
 }
 
-define inreg i32 @bitcast_v2bf16_to_i32_scalar(<2 x bfloat> inreg %a, i32 inreg %b) {
+define inreg i32 @bitcast_v2bf16_to_i32_scalar(<2 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2bf16_to_i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1571,7 +1571,7 @@ end:
   ret i32 %phi
 }
 
-define <1 x i32> @bitcast_i32_to_v1i32(i32 %a, i32 %b) {
+define <1 x i32> @bitcast_i32_to_v1i32(i32 %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_i32_to_v1i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1641,7 +1641,7 @@ end:
   ret <1 x i32> %phi
 }
 
-define inreg <1 x i32> @bitcast_i32_to_v1i32_scalar(i32 inreg %a, i32 inreg %b) {
+define inreg <1 x i32> @bitcast_i32_to_v1i32_scalar(i32 inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_i32_to_v1i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1722,7 +1722,7 @@ end:
   ret <1 x i32> %phi
 }
 
-define i32 @bitcast_v1i32_to_i32(<1 x i32> %a, i32 %b) {
+define i32 @bitcast_v1i32_to_i32(<1 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v1i32_to_i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1792,7 +1792,7 @@ end:
   ret i32 %phi
 }
 
-define inreg i32 @bitcast_v1i32_to_i32_scalar(<1 x i32> inreg %a, i32 inreg %b) {
+define inreg i32 @bitcast_v1i32_to_i32_scalar(<1 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v1i32_to_i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1873,7 +1873,7 @@ end:
   ret i32 %phi
 }
 
-define <4 x i8> @bitcast_i32_to_v4i8(i32 %a, i32 %b) {
+define <4 x i8> @bitcast_i32_to_v4i8(i32 %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_i32_to_v4i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2039,7 +2039,7 @@ end:
   ret <4 x i8> %phi
 }
 
-define inreg <4 x i8> @bitcast_i32_to_v4i8_scalar(i32 inreg %a, i32 inreg %b) {
+define inreg <4 x i8> @bitcast_i32_to_v4i8_scalar(i32 inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_i32_to_v4i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2166,7 +2166,7 @@ end:
   ret <4 x i8> %phi
 }
 
-define i32 @bitcast_v4i8_to_i32(<4 x i8> %a, i32 %b) {
+define i32 @bitcast_v4i8_to_i32(<4 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4i8_to_i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2401,7 +2401,7 @@ end:
   ret i32 %phi
 }
 
-define inreg i32 @bitcast_v4i8_to_i32_scalar(<4 x i8> inreg %a, i32 inreg %b) {
+define inreg i32 @bitcast_v4i8_to_i32_scalar(<4 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4i8_to_i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2555,7 +2555,7 @@ end:
   ret i32 %phi
 }
 
-define <2 x i16> @bitcast_f32_to_v2i16(float %a, i32 %b) {
+define <2 x i16> @bitcast_f32_to_v2i16(float %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_f32_to_v2i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2633,7 +2633,7 @@ end:
   ret <2 x i16> %phi
 }
 
-define inreg <2 x i16> @bitcast_f32_to_v2i16_scalar(float inreg %a, i32 inreg %b) {
+define inreg <2 x i16> @bitcast_f32_to_v2i16_scalar(float inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_f32_to_v2i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2724,7 +2724,7 @@ end:
   ret <2 x i16> %phi
 }
 
-define float @bitcast_v2i16_to_f32(<2 x i16> %a, i32 %b) {
+define float @bitcast_v2i16_to_f32(<2 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2i16_to_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2816,7 +2816,7 @@ end:
   ret float %phi
 }
 
-define inreg float @bitcast_v2i16_to_f32_scalar(<2 x i16> inreg %a, i32 inreg %b) {
+define inreg float @bitcast_v2i16_to_f32_scalar(<2 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2i16_to_f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2910,7 +2910,7 @@ end:
   ret float %phi
 }
 
-define <2 x half> @bitcast_f32_to_v2f16(float %a, i32 %b) {
+define <2 x half> @bitcast_f32_to_v2f16(float %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_f32_to_v2f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2988,7 +2988,7 @@ end:
   ret <2 x half> %phi
 }
 
-define inreg <2 x half> @bitcast_f32_to_v2f16_scalar(float inreg %a, i32 inreg %b) {
+define inreg <2 x half> @bitcast_f32_to_v2f16_scalar(float inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_f32_to_v2f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3079,7 +3079,7 @@ end:
   ret <2 x half> %phi
 }
 
-define float @bitcast_v2f16_to_f32(<2 x half> %a, i32 %b) {
+define float @bitcast_v2f16_to_f32(<2 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2f16_to_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3171,7 +3171,7 @@ end:
   ret float %phi
 }
 
-define inreg float @bitcast_v2f16_to_f32_scalar(<2 x half> inreg %a, i32 inreg %b) {
+define inreg float @bitcast_v2f16_to_f32_scalar(<2 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2f16_to_f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3272,7 +3272,7 @@ end:
   ret float %phi
 }
 
-define <2 x bfloat> @bitcast_f32_to_v2bf16(float %a, i32 %b) {
+define <2 x bfloat> @bitcast_f32_to_v2bf16(float %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_f32_to_v2bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3355,7 +3355,7 @@ end:
   ret <2 x bfloat> %phi
 }
 
-define inreg <2 x bfloat> @bitcast_f32_to_v2bf16_scalar(float inreg %a, i32 inreg %b) {
+define inreg <2 x bfloat> @bitcast_f32_to_v2bf16_scalar(float inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_f32_to_v2bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3450,7 +3450,7 @@ end:
   ret <2 x bfloat> %phi
 }
 
-define float @bitcast_v2bf16_to_f32(<2 x bfloat> %a, i32 %b) {
+define float @bitcast_v2bf16_to_f32(<2 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2bf16_to_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3634,7 +3634,7 @@ end:
   ret float %phi
 }
 
-define inreg float @bitcast_v2bf16_to_f32_scalar(<2 x bfloat> inreg %a, i32 inreg %b) {
+define inreg float @bitcast_v2bf16_to_f32_scalar(<2 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2bf16_to_f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3829,7 +3829,7 @@ end:
   ret float %phi
 }
 
-define <1 x i32> @bitcast_f32_to_v1i32(float %a, i32 %b) {
+define <1 x i32> @bitcast_f32_to_v1i32(float %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_f32_to_v1i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3899,7 +3899,7 @@ end:
   ret <1 x i32> %phi
 }
 
-define inreg <1 x i32> @bitcast_f32_to_v1i32_scalar(float inreg %a, i32 inreg %b) {
+define inreg <1 x i32> @bitcast_f32_to_v1i32_scalar(float inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_f32_to_v1i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3982,7 +3982,7 @@ end:
   ret <1 x i32> %phi
 }
 
-define float @bitcast_v1i32_to_f32(<1 x i32> %a, i32 %b) {
+define float @bitcast_v1i32_to_f32(<1 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v1i32_to_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4052,7 +4052,7 @@ end:
   ret float %phi
 }
 
-define inreg float @bitcast_v1i32_to_f32_scalar(<1 x i32> inreg %a, i32 inreg %b) {
+define inreg float @bitcast_v1i32_to_f32_scalar(<1 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v1i32_to_f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4133,7 +4133,7 @@ end:
   ret float %phi
 }
 
-define <4 x i8> @bitcast_f32_to_v4i8(float %a, i32 %b) {
+define <4 x i8> @bitcast_f32_to_v4i8(float %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_f32_to_v4i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4299,7 +4299,7 @@ end:
   ret <4 x i8> %phi
 }
 
-define inreg <4 x i8> @bitcast_f32_to_v4i8_scalar(float inreg %a, i32 inreg %b) {
+define inreg <4 x i8> @bitcast_f32_to_v4i8_scalar(float inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_f32_to_v4i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4429,7 +4429,7 @@ end:
   ret <4 x i8> %phi
 }
 
-define float @bitcast_v4i8_to_f32(<4 x i8> %a, i32 %b) {
+define float @bitcast_v4i8_to_f32(<4 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4i8_to_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4664,7 +4664,7 @@ end:
   ret float %phi
 }
 
-define inreg float @bitcast_v4i8_to_f32_scalar(<4 x i8> inreg %a, i32 inreg %b) {
+define inreg float @bitcast_v4i8_to_f32_scalar(<4 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4i8_to_f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4818,7 +4818,7 @@ end:
   ret float %phi
 }
 
-define <2 x half> @bitcast_v2i16_to_v2f16(<2 x i16> %a, i32 %b) {
+define <2 x half> @bitcast_v2i16_to_v2f16(<2 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2i16_to_v2f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4907,7 +4907,7 @@ end:
   ret <2 x half> %phi
 }
 
-define inreg <2 x half> @bitcast_v2i16_to_v2f16_scalar(<2 x i16> inreg %a, i32 inreg %b) {
+define inreg <2 x half> @bitcast_v2i16_to_v2f16_scalar(<2 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2i16_to_v2f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5005,7 +5005,7 @@ end:
   ret <2 x half> %phi
 }
 
-define <2 x i16> @bitcast_v2f16_to_v2i16(<2 x half> %a, i32 %b) {
+define <2 x i16> @bitcast_v2f16_to_v2i16(<2 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2f16_to_v2i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5091,7 +5091,7 @@ end:
   ret <2 x i16> %phi
 }
 
-define inreg <2 x i16> @bitcast_v2f16_to_v2i16_scalar(<2 x half> inreg %a, i32 inreg %b) {
+define inreg <2 x i16> @bitcast_v2f16_to_v2i16_scalar(<2 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2f16_to_v2i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5193,7 +5193,7 @@ end:
   ret <2 x i16> %phi
 }
 
-define <2 x bfloat> @bitcast_v2i16_to_v2bf16(<2 x i16> %a, i32 %b) {
+define <2 x bfloat> @bitcast_v2i16_to_v2bf16(<2 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2i16_to_v2bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5274,7 +5274,7 @@ end:
   ret <2 x bfloat> %phi
 }
 
-define inreg <2 x bfloat> @bitcast_v2i16_to_v2bf16_scalar(<2 x i16> inreg %a, i32 inreg %b) {
+define inreg <2 x bfloat> @bitcast_v2i16_to_v2bf16_scalar(<2 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2i16_to_v2bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5370,7 +5370,7 @@ end:
   ret <2 x bfloat> %phi
 }
 
-define <2 x i16> @bitcast_v2bf16_to_v2i16(<2 x bfloat> %a, i32 %b) {
+define <2 x i16> @bitcast_v2bf16_to_v2i16(<2 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2bf16_to_v2i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5545,7 +5545,7 @@ end:
   ret <2 x i16> %phi
 }
 
-define inreg <2 x i16> @bitcast_v2bf16_to_v2i16_scalar(<2 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <2 x i16> @bitcast_v2bf16_to_v2i16_scalar(<2 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2bf16_to_v2i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5737,7 +5737,7 @@ end:
   ret <2 x i16> %phi
 }
 
-define <1 x i32> @bitcast_v2i16_to_v1i32(<2 x i16> %a, i32 %b) {
+define <1 x i32> @bitcast_v2i16_to_v1i32(<2 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2i16_to_v1i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5829,7 +5829,7 @@ end:
   ret <1 x i32> %phi
 }
 
-define inreg <1 x i32> @bitcast_v2i16_to_v1i32_scalar(<2 x i16> inreg %a, i32 inreg %b) {
+define inreg <1 x i32> @bitcast_v2i16_to_v1i32_scalar(<2 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2i16_to_v1i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5923,7 +5923,7 @@ end:
   ret <1 x i32> %phi
 }
 
-define <2 x i16> @bitcast_v1i32_to_v2i16(<1 x i32> %a, i32 %b) {
+define <2 x i16> @bitcast_v1i32_to_v2i16(<1 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v1i32_to_v2i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6001,7 +6001,7 @@ end:
   ret <2 x i16> %phi
 }
 
-define inreg <2 x i16> @bitcast_v1i32_to_v2i16_scalar(<1 x i32> inreg %a, i32 inreg %b) {
+define inreg <2 x i16> @bitcast_v1i32_to_v2i16_scalar(<1 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v1i32_to_v2i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6088,7 +6088,7 @@ end:
   ret <2 x i16> %phi
 }
 
-define <4 x i8> @bitcast_v2i16_to_v4i8(<2 x i16> %a, i32 %b) {
+define <4 x i8> @bitcast_v2i16_to_v4i8(<2 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2i16_to_v4i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6269,7 +6269,7 @@ end:
   ret <4 x i8> %phi
 }
 
-define inreg <4 x i8> @bitcast_v2i16_to_v4i8_scalar(<2 x i16> inreg %a, i32 inreg %b) {
+define inreg <4 x i8> @bitcast_v2i16_to_v4i8_scalar(<2 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2i16_to_v4i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6410,7 +6410,7 @@ end:
   ret <4 x i8> %phi
 }
 
-define <2 x i16> @bitcast_v4i8_to_v2i16(<4 x i8> %a, i32 %b) {
+define <2 x i16> @bitcast_v4i8_to_v2i16(<4 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4i8_to_v2i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6645,7 +6645,7 @@ end:
   ret <2 x i16> %phi
 }
 
-define inreg <2 x i16> @bitcast_v4i8_to_v2i16_scalar(<4 x i8> inreg %a, i32 inreg %b) {
+define inreg <2 x i16> @bitcast_v4i8_to_v2i16_scalar(<4 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4i8_to_v2i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6805,7 +6805,7 @@ end:
   ret <2 x i16> %phi
 }
 
-define <2 x bfloat> @bitcast_v2f16_to_v2bf16(<2 x half> %a, i32 %b) {
+define <2 x bfloat> @bitcast_v2f16_to_v2bf16(<2 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2f16_to_v2bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6900,7 +6900,7 @@ end:
   ret <2 x bfloat> %phi
 }
 
-define inreg <2 x bfloat> @bitcast_v2f16_to_v2bf16_scalar(<2 x half> inreg %a, i32 inreg %b) {
+define inreg <2 x bfloat> @bitcast_v2f16_to_v2bf16_scalar(<2 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2f16_to_v2bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7007,7 +7007,7 @@ end:
   ret <2 x bfloat> %phi
 }
 
-define <2 x half> @bitcast_v2bf16_to_v2f16(<2 x bfloat> %a, i32 %b) {
+define <2 x half> @bitcast_v2bf16_to_v2f16(<2 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2bf16_to_v2f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7189,7 +7189,7 @@ end:
   ret <2 x half> %phi
 }
 
-define inreg <2 x half> @bitcast_v2bf16_to_v2f16_scalar(<2 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <2 x half> @bitcast_v2bf16_to_v2f16_scalar(<2 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2bf16_to_v2f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7387,7 +7387,7 @@ end:
   ret <2 x half> %phi
 }
 
-define <1 x i32> @bitcast_v2f16_to_v1i32(<2 x half> %a, i32 %b) {
+define <1 x i32> @bitcast_v2f16_to_v1i32(<2 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2f16_to_v1i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7479,7 +7479,7 @@ end:
   ret <1 x i32> %phi
 }
 
-define inreg <1 x i32> @bitcast_v2f16_to_v1i32_scalar(<2 x half> inreg %a, i32 inreg %b) {
+define inreg <1 x i32> @bitcast_v2f16_to_v1i32_scalar(<2 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2f16_to_v1i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7580,7 +7580,7 @@ end:
   ret <1 x i32> %phi
 }
 
-define <2 x half> @bitcast_v1i32_to_v2f16(<1 x i32> %a, i32 %b) {
+define <2 x half> @bitcast_v1i32_to_v2f16(<1 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v1i32_to_v2f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7658,7 +7658,7 @@ end:
   ret <2 x half> %phi
 }
 
-define inreg <2 x half> @bitcast_v1i32_to_v2f16_scalar(<1 x i32> inreg %a, i32 inreg %b) {
+define inreg <2 x half> @bitcast_v1i32_to_v2f16_scalar(<1 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v1i32_to_v2f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7745,7 +7745,7 @@ end:
   ret <2 x half> %phi
 }
 
-define <4 x i8> @bitcast_v2f16_to_v4i8(<2 x half> %a, i32 %b) {
+define <4 x i8> @bitcast_v2f16_to_v4i8(<2 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2f16_to_v4i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7925,7 +7925,7 @@ end:
   ret <4 x i8> %phi
 }
 
-define inreg <4 x i8> @bitcast_v2f16_to_v4i8_scalar(<2 x half> inreg %a, i32 inreg %b) {
+define inreg <4 x i8> @bitcast_v2f16_to_v4i8_scalar(<2 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2f16_to_v4i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8069,7 +8069,7 @@ end:
   ret <4 x i8> %phi
 }
 
-define <2 x half> @bitcast_v4i8_to_v2f16(<4 x i8> %a, i32 %b) {
+define <2 x half> @bitcast_v4i8_to_v2f16(<4 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4i8_to_v2f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8304,7 +8304,7 @@ end:
   ret <2 x half> %phi
 }
 
-define inreg <2 x half> @bitcast_v4i8_to_v2f16_scalar(<4 x i8> inreg %a, i32 inreg %b) {
+define inreg <2 x half> @bitcast_v4i8_to_v2f16_scalar(<4 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4i8_to_v2f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8464,7 +8464,7 @@ end:
   ret <2 x half> %phi
 }
 
-define <1 x i32> @bitcast_v2bf16_to_v1i32(<2 x bfloat> %a, i32 %b) {
+define <1 x i32> @bitcast_v2bf16_to_v1i32(<2 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2bf16_to_v1i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8648,7 +8648,7 @@ end:
   ret <1 x i32> %phi
 }
 
-define inreg <1 x i32> @bitcast_v2bf16_to_v1i32_scalar(<2 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <1 x i32> @bitcast_v2bf16_to_v1i32_scalar(<2 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2bf16_to_v1i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8843,7 +8843,7 @@ end:
   ret <1 x i32> %phi
 }
 
-define <2 x bfloat> @bitcast_v1i32_to_v2bf16(<1 x i32> %a, i32 %b) {
+define <2 x bfloat> @bitcast_v1i32_to_v2bf16(<1 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v1i32_to_v2bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8926,7 +8926,7 @@ end:
   ret <2 x bfloat> %phi
 }
 
-define inreg <2 x bfloat> @bitcast_v1i32_to_v2bf16_scalar(<1 x i32> inreg %a, i32 inreg %b) {
+define inreg <2 x bfloat> @bitcast_v1i32_to_v2bf16_scalar(<1 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v1i32_to_v2bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9016,7 +9016,7 @@ end:
   ret <2 x bfloat> %phi
 }
 
-define <4 x i8> @bitcast_v2bf16_to_v4i8(<2 x bfloat> %a, i32 %b) {
+define <4 x i8> @bitcast_v2bf16_to_v4i8(<2 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2bf16_to_v4i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9267,7 +9267,7 @@ end:
   ret <4 x i8> %phi
 }
 
-define inreg <4 x i8> @bitcast_v2bf16_to_v4i8_scalar(<2 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <4 x i8> @bitcast_v2bf16_to_v4i8_scalar(<2 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2bf16_to_v4i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9515,7 +9515,7 @@ end:
   ret <4 x i8> %phi
 }
 
-define <2 x bfloat> @bitcast_v4i8_to_v2bf16(<4 x i8> %a, i32 %b) {
+define <2 x bfloat> @bitcast_v4i8_to_v2bf16(<4 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4i8_to_v2bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9747,7 +9747,7 @@ end:
   ret <2 x bfloat> %phi
 }
 
-define inreg <2 x bfloat> @bitcast_v4i8_to_v2bf16_scalar(<4 x i8> inreg %a, i32 inreg %b) {
+define inreg <2 x bfloat> @bitcast_v4i8_to_v2bf16_scalar(<4 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4i8_to_v2bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9903,7 +9903,7 @@ end:
   ret <2 x bfloat> %phi
 }
 
-define <4 x i8> @bitcast_v1i32_to_v4i8(<1 x i32> %a, i32 %b) {
+define <4 x i8> @bitcast_v1i32_to_v4i8(<1 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v1i32_to_v4i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10069,7 +10069,7 @@ end:
   ret <4 x i8> %phi
 }
 
-define inreg <4 x i8> @bitcast_v1i32_to_v4i8_scalar(<1 x i32> inreg %a, i32 inreg %b) {
+define inreg <4 x i8> @bitcast_v1i32_to_v4i8_scalar(<1 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v1i32_to_v4i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10196,7 +10196,7 @@ end:
   ret <4 x i8> %phi
 }
 
-define <1 x i32> @bitcast_v4i8_to_v1i32(<4 x i8> %a, i32 %b) {
+define <1 x i32> @bitcast_v4i8_to_v1i32(<4 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4i8_to_v1i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10431,7 +10431,7 @@ end:
   ret <1 x i32> %phi
 }
 
-define inreg <1 x i32> @bitcast_v4i8_to_v1i32_scalar(<4 x i8> inreg %a, i32 inreg %b) {
+define inreg <1 x i32> @bitcast_v4i8_to_v1i32_scalar(<4 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4i8_to_v1i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10584,3 +10584,5 @@ end:
   %phi = phi <1 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
   ret <1 x i32> %phi
 }
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll
index 70ed2ca42b706..79c9fc7faf339 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll
@@ -6,7 +6,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
-define <11 x float> @bitcast_v11i32_to_v11f32(<11 x i32> %a, i32 %b) {
+define <11 x float> @bitcast_v11i32_to_v11f32(<11 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v11i32_to_v11f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -120,7 +120,7 @@ end:
   ret <11 x float> %phi
 }
 
-define inreg <11 x float> @bitcast_v11i32_to_v11f32_scalar(<11 x i32> inreg %a, i32 inreg %b) {
+define inreg <11 x float> @bitcast_v11i32_to_v11f32_scalar(<11 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v11i32_to_v11f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -275,7 +275,7 @@ end:
   ret <11 x float> %phi
 }
 
-define <11 x i32> @bitcast_v11f32_to_v11i32(<11 x float> %a, i32 %b) {
+define <11 x i32> @bitcast_v11f32_to_v11i32(<11 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v11f32_to_v11i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -383,7 +383,7 @@ end:
   ret <11 x i32> %phi
 }
 
-define inreg <11 x i32> @bitcast_v11f32_to_v11i32_scalar(<11 x float> inreg %a, i32 inreg %b) {
+define inreg <11 x i32> @bitcast_v11f32_to_v11i32_scalar(<11 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v11f32_to_v11i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -545,7 +545,7 @@ end:
   ret <11 x i32> %phi
 }
 
-define <22 x i16> @bitcast_v11i32_to_v22i16(<11 x i32> %a, i32 %b) {
+define <22 x i16> @bitcast_v11i32_to_v22i16(<11 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v11i32_to_v22i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -728,7 +728,7 @@ end:
   ret <22 x i16> %phi
 }
 
-define inreg <22 x i16> @bitcast_v11i32_to_v22i16_scalar(<11 x i32> inreg %a, i32 inreg %b) {
+define inreg <22 x i16> @bitcast_v11i32_to_v22i16_scalar(<11 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v11i32_to_v22i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -949,7 +949,7 @@ end:
   ret <22 x i16> %phi
 }
 
-define <11 x i32> @bitcast_v22i16_to_v11i32(<22 x i16> %a, i32 %b) {
+define <11 x i32> @bitcast_v22i16_to_v11i32(<22 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v22i16_to_v11i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1205,7 +1205,7 @@ end:
   ret <11 x i32> %phi
 }
 
-define inreg <11 x i32> @bitcast_v22i16_to_v11i32_scalar(<22 x i16> inreg %a, i32 inreg %b) {
+define inreg <11 x i32> @bitcast_v22i16_to_v11i32_scalar(<22 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v22i16_to_v11i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1498,7 +1498,7 @@ end:
   ret <11 x i32> %phi
 }
 
-define <22 x half> @bitcast_v11i32_to_v22f16(<11 x i32> %a, i32 %b) {
+define <22 x half> @bitcast_v11i32_to_v22f16(<11 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v11i32_to_v22f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1681,7 +1681,7 @@ end:
   ret <22 x half> %phi
 }
 
-define inreg <22 x half> @bitcast_v11i32_to_v22f16_scalar(<11 x i32> inreg %a, i32 inreg %b) {
+define inreg <22 x half> @bitcast_v11i32_to_v22f16_scalar(<11 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v11i32_to_v22f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1902,7 +1902,7 @@ end:
   ret <22 x half> %phi
 }
 
-define <11 x i32> @bitcast_v22f16_to_v11i32(<22 x half> %a, i32 %b) {
+define <11 x i32> @bitcast_v22f16_to_v11i32(<22 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v22f16_to_v11i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2202,7 +2202,7 @@ end:
   ret <11 x i32> %phi
 }
 
-define inreg <11 x i32> @bitcast_v22f16_to_v11i32_scalar(<22 x half> inreg %a, i32 inreg %b) {
+define inreg <11 x i32> @bitcast_v22f16_to_v11i32_scalar(<22 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v22f16_to_v11i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2532,7 +2532,7 @@ end:
   ret <11 x i32> %phi
 }
 
-define <22 x i16> @bitcast_v11f32_to_v22i16(<11 x float> %a, i32 %b) {
+define <22 x i16> @bitcast_v11f32_to_v22i16(<11 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v11f32_to_v22i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2709,7 +2709,7 @@ end:
   ret <22 x i16> %phi
 }
 
-define inreg <22 x i16> @bitcast_v11f32_to_v22i16_scalar(<11 x float> inreg %a, i32 inreg %b) {
+define inreg <22 x i16> @bitcast_v11f32_to_v22i16_scalar(<11 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v11f32_to_v22i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2961,7 +2961,7 @@ end:
   ret <22 x i16> %phi
 }
 
-define <11 x float> @bitcast_v22i16_to_v11f32(<22 x i16> %a, i32 %b) {
+define <11 x float> @bitcast_v22i16_to_v11f32(<22 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v22i16_to_v11f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3217,7 +3217,7 @@ end:
   ret <11 x float> %phi
 }
 
-define inreg <11 x float> @bitcast_v22i16_to_v11f32_scalar(<22 x i16> inreg %a, i32 inreg %b) {
+define inreg <11 x float> @bitcast_v22i16_to_v11f32_scalar(<22 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v22i16_to_v11f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3510,7 +3510,7 @@ end:
   ret <11 x float> %phi
 }
 
-define <22 x half> @bitcast_v11f32_to_v22f16(<11 x float> %a, i32 %b) {
+define <22 x half> @bitcast_v11f32_to_v22f16(<11 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v11f32_to_v22f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3687,7 +3687,7 @@ end:
   ret <22 x half> %phi
 }
 
-define inreg <22 x half> @bitcast_v11f32_to_v22f16_scalar(<11 x float> inreg %a, i32 inreg %b) {
+define inreg <22 x half> @bitcast_v11f32_to_v22f16_scalar(<11 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v11f32_to_v22f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3939,7 +3939,7 @@ end:
   ret <22 x half> %phi
 }
 
-define <11 x float> @bitcast_v22f16_to_v11f32(<22 x half> %a, i32 %b) {
+define <11 x float> @bitcast_v22f16_to_v11f32(<22 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v22f16_to_v11f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4239,7 +4239,7 @@ end:
   ret <11 x float> %phi
 }
 
-define inreg <11 x float> @bitcast_v22f16_to_v11f32_scalar(<22 x half> inreg %a, i32 inreg %b) {
+define inreg <11 x float> @bitcast_v22f16_to_v11f32_scalar(<22 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v22f16_to_v11f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4569,7 +4569,7 @@ end:
   ret <11 x float> %phi
 }
 
-define <22 x half> @bitcast_v22i16_to_v22f16(<22 x i16> %a, i32 %b) {
+define <22 x half> @bitcast_v22i16_to_v22f16(<22 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v22i16_to_v22f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4874,7 +4874,7 @@ end:
   ret <22 x half> %phi
 }
 
-define inreg <22 x half> @bitcast_v22i16_to_v22f16_scalar(<22 x i16> inreg %a, i32 inreg %b) {
+define inreg <22 x half> @bitcast_v22i16_to_v22f16_scalar(<22 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v22i16_to_v22f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5238,7 +5238,7 @@ end:
   ret <22 x half> %phi
 }
 
-define <22 x i16> @bitcast_v22f16_to_v22i16(<22 x half> %a, i32 %b) {
+define <22 x i16> @bitcast_v22f16_to_v22i16(<22 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v22f16_to_v22i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5502,7 +5502,7 @@ end:
   ret <22 x i16> %phi
 }
 
-define inreg <22 x i16> @bitcast_v22f16_to_v22i16_scalar(<22 x half> inreg %a, i32 inreg %b) {
+define inreg <22 x i16> @bitcast_v22f16_to_v22i16_scalar(<22 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v22f16_to_v22i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5859,6 +5859,9 @@ end:
   %phi = phi <22 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
   ret <22 x i16> %phi
 }
+
+attributes #0 = { nounwind }
+
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; GFX11-FAKE16: {{.*}}
 ; GFX11-TRUE16: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll
index 60c5431f7e4c6..cb0e72323a165 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll
@@ -6,7 +6,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
-define <12 x float> @bitcast_v12i32_to_v12f32(<12 x i32> %a, i32 %b) {
+define <12 x float> @bitcast_v12i32_to_v12f32(<12 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v12i32_to_v12f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -124,7 +124,7 @@ end:
   ret <12 x float> %phi
 }
 
-define inreg <12 x float> @bitcast_v12i32_to_v12f32_scalar(<12 x i32> inreg %a, i32 inreg %b) {
+define inreg <12 x float> @bitcast_v12i32_to_v12f32_scalar(<12 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v12i32_to_v12f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -286,7 +286,7 @@ end:
   ret <12 x float> %phi
 }
 
-define <12 x i32> @bitcast_v12f32_to_v12i32(<12 x float> %a, i32 %b) {
+define <12 x i32> @bitcast_v12f32_to_v12i32(<12 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v12f32_to_v12i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -397,7 +397,7 @@ end:
   ret <12 x i32> %phi
 }
 
-define inreg <12 x i32> @bitcast_v12f32_to_v12i32_scalar(<12 x float> inreg %a, i32 inreg %b) {
+define inreg <12 x i32> @bitcast_v12f32_to_v12i32_scalar(<12 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v12f32_to_v12i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -566,7 +566,7 @@ end:
   ret <12 x i32> %phi
 }
 
-define <6 x double> @bitcast_v12i32_to_v6f64(<12 x i32> %a, i32 %b) {
+define <6 x double> @bitcast_v12i32_to_v6f64(<12 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v12i32_to_v6f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -684,7 +684,7 @@ end:
   ret <6 x double> %phi
 }
 
-define inreg <6 x double> @bitcast_v12i32_to_v6f64_scalar(<12 x i32> inreg %a, i32 inreg %b) {
+define inreg <6 x double> @bitcast_v12i32_to_v6f64_scalar(<12 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v12i32_to_v6f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -846,7 +846,7 @@ end:
   ret <6 x double> %phi
 }
 
-define <12 x i32> @bitcast_v6f64_to_v12i32(<6 x double> %a, i32 %b) {
+define <12 x i32> @bitcast_v6f64_to_v12i32(<6 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v6f64_to_v12i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -940,7 +940,7 @@ end:
   ret <12 x i32> %phi
 }
 
-define inreg <12 x i32> @bitcast_v6f64_to_v12i32_scalar(<6 x double> inreg %a, i32 inreg %b) {
+define inreg <12 x i32> @bitcast_v6f64_to_v12i32_scalar(<6 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v6f64_to_v12i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1085,7 +1085,7 @@ end:
   ret <12 x i32> %phi
 }
 
-define <6 x i64> @bitcast_v12i32_to_v6i64(<12 x i32> %a, i32 %b) {
+define <6 x i64> @bitcast_v12i32_to_v6i64(<12 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v12i32_to_v6i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1203,7 +1203,7 @@ end:
   ret <6 x i64> %phi
 }
 
-define inreg <6 x i64> @bitcast_v12i32_to_v6i64_scalar(<12 x i32> inreg %a, i32 inreg %b) {
+define inreg <6 x i64> @bitcast_v12i32_to_v6i64_scalar(<12 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v12i32_to_v6i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1365,7 +1365,7 @@ end:
   ret <6 x i64> %phi
 }
 
-define <12 x i32> @bitcast_v6i64_to_v12i32(<6 x i64> %a, i32 %b) {
+define <12 x i32> @bitcast_v6i64_to_v12i32(<6 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v6i64_to_v12i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1486,7 +1486,7 @@ end:
   ret <12 x i32> %phi
 }
 
-define inreg <12 x i32> @bitcast_v6i64_to_v12i32_scalar(<6 x i64> inreg %a, i32 inreg %b) {
+define inreg <12 x i32> @bitcast_v6i64_to_v12i32_scalar(<6 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v6i64_to_v12i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1648,7 +1648,7 @@ end:
   ret <12 x i32> %phi
 }
 
-define <24 x i16> @bitcast_v12i32_to_v24i16(<12 x i32> %a, i32 %b) {
+define <24 x i16> @bitcast_v12i32_to_v24i16(<12 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v12i32_to_v24i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1841,7 +1841,7 @@ end:
   ret <24 x i16> %phi
 }
 
-define inreg <24 x i16> @bitcast_v12i32_to_v24i16_scalar(<12 x i32> inreg %a, i32 inreg %b) {
+define inreg <24 x i16> @bitcast_v12i32_to_v24i16_scalar(<12 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v12i32_to_v24i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2075,7 +2075,7 @@ end:
   ret <24 x i16> %phi
 }
 
-define <12 x i32> @bitcast_v24i16_to_v12i32(<24 x i16> %a, i32 %b) {
+define <12 x i32> @bitcast_v24i16_to_v12i32(<24 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v24i16_to_v12i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2347,7 +2347,7 @@ end:
   ret <12 x i32> %phi
 }
 
-define inreg <12 x i32> @bitcast_v24i16_to_v12i32_scalar(<24 x i16> inreg %a, i32 inreg %b) {
+define inreg <12 x i32> @bitcast_v24i16_to_v12i32_scalar(<24 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v24i16_to_v12i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2659,7 +2659,7 @@ end:
   ret <12 x i32> %phi
 }
 
-define <24 x half> @bitcast_v12i32_to_v24f16(<12 x i32> %a, i32 %b) {
+define <24 x half> @bitcast_v12i32_to_v24f16(<12 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v12i32_to_v24f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2852,7 +2852,7 @@ end:
   ret <24 x half> %phi
 }
 
-define inreg <24 x half> @bitcast_v12i32_to_v24f16_scalar(<12 x i32> inreg %a, i32 inreg %b) {
+define inreg <24 x half> @bitcast_v12i32_to_v24f16_scalar(<12 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v12i32_to_v24f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3086,7 +3086,7 @@ end:
   ret <24 x half> %phi
 }
 
-define <12 x i32> @bitcast_v24f16_to_v12i32(<24 x half> %a, i32 %b) {
+define <12 x i32> @bitcast_v24f16_to_v12i32(<24 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v24f16_to_v12i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3406,7 +3406,7 @@ end:
   ret <12 x i32> %phi
 }
 
-define inreg <12 x i32> @bitcast_v24f16_to_v12i32_scalar(<24 x half> inreg %a, i32 inreg %b) {
+define inreg <12 x i32> @bitcast_v24f16_to_v12i32_scalar(<24 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v24f16_to_v12i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3758,7 +3758,7 @@ end:
   ret <12 x i32> %phi
 }
 
-define <6 x double> @bitcast_v12f32_to_v6f64(<12 x float> %a, i32 %b) {
+define <6 x double> @bitcast_v12f32_to_v6f64(<12 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v12f32_to_v6f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3869,7 +3869,7 @@ end:
   ret <6 x double> %phi
 }
 
-define inreg <6 x double> @bitcast_v12f32_to_v6f64_scalar(<12 x float> inreg %a, i32 inreg %b) {
+define inreg <6 x double> @bitcast_v12f32_to_v6f64_scalar(<12 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v12f32_to_v6f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4052,7 +4052,7 @@ end:
   ret <6 x double> %phi
 }
 
-define <12 x float> @bitcast_v6f64_to_v12f32(<6 x double> %a, i32 %b) {
+define <12 x float> @bitcast_v6f64_to_v12f32(<6 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v6f64_to_v12f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4146,7 +4146,7 @@ end:
   ret <12 x float> %phi
 }
 
-define inreg <12 x float> @bitcast_v6f64_to_v12f32_scalar(<6 x double> inreg %a, i32 inreg %b) {
+define inreg <12 x float> @bitcast_v6f64_to_v12f32_scalar(<6 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v6f64_to_v12f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4291,7 +4291,7 @@ end:
   ret <12 x float> %phi
 }
 
-define <6 x i64> @bitcast_v12f32_to_v6i64(<12 x float> %a, i32 %b) {
+define <6 x i64> @bitcast_v12f32_to_v6i64(<12 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v12f32_to_v6i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4402,7 +4402,7 @@ end:
   ret <6 x i64> %phi
 }
 
-define inreg <6 x i64> @bitcast_v12f32_to_v6i64_scalar(<12 x float> inreg %a, i32 inreg %b) {
+define inreg <6 x i64> @bitcast_v12f32_to_v6i64_scalar(<12 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v12f32_to_v6i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4585,7 +4585,7 @@ end:
   ret <6 x i64> %phi
 }
 
-define <12 x float> @bitcast_v6i64_to_v12f32(<6 x i64> %a, i32 %b) {
+define <12 x float> @bitcast_v6i64_to_v12f32(<6 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v6i64_to_v12f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4706,7 +4706,7 @@ end:
   ret <12 x float> %phi
 }
 
-define inreg <12 x float> @bitcast_v6i64_to_v12f32_scalar(<6 x i64> inreg %a, i32 inreg %b) {
+define inreg <12 x float> @bitcast_v6i64_to_v12f32_scalar(<6 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v6i64_to_v12f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4868,7 +4868,7 @@ end:
   ret <12 x float> %phi
 }
 
-define <24 x i16> @bitcast_v12f32_to_v24i16(<12 x float> %a, i32 %b) {
+define <24 x i16> @bitcast_v12f32_to_v24i16(<12 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v12f32_to_v24i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5054,7 +5054,7 @@ end:
   ret <24 x i16> %phi
 }
 
-define inreg <24 x i16> @bitcast_v12f32_to_v24i16_scalar(<12 x float> inreg %a, i32 inreg %b) {
+define inreg <24 x i16> @bitcast_v12f32_to_v24i16_scalar(<12 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v12f32_to_v24i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5318,7 +5318,7 @@ end:
   ret <24 x i16> %phi
 }
 
-define <12 x float> @bitcast_v24i16_to_v12f32(<24 x i16> %a, i32 %b) {
+define <12 x float> @bitcast_v24i16_to_v12f32(<24 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v24i16_to_v12f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5590,7 +5590,7 @@ end:
   ret <12 x float> %phi
 }
 
-define inreg <12 x float> @bitcast_v24i16_to_v12f32_scalar(<24 x i16> inreg %a, i32 inreg %b) {
+define inreg <12 x float> @bitcast_v24i16_to_v12f32_scalar(<24 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v24i16_to_v12f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5902,7 +5902,7 @@ end:
   ret <12 x float> %phi
 }
 
-define <24 x half> @bitcast_v12f32_to_v24f16(<12 x float> %a, i32 %b) {
+define <24 x half> @bitcast_v12f32_to_v24f16(<12 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v12f32_to_v24f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6088,7 +6088,7 @@ end:
   ret <24 x half> %phi
 }
 
-define inreg <24 x half> @bitcast_v12f32_to_v24f16_scalar(<12 x float> inreg %a, i32 inreg %b) {
+define inreg <24 x half> @bitcast_v12f32_to_v24f16_scalar(<12 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v12f32_to_v24f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6352,7 +6352,7 @@ end:
   ret <24 x half> %phi
 }
 
-define <12 x float> @bitcast_v24f16_to_v12f32(<24 x half> %a, i32 %b) {
+define <12 x float> @bitcast_v24f16_to_v12f32(<24 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v24f16_to_v12f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6672,7 +6672,7 @@ end:
   ret <12 x float> %phi
 }
 
-define inreg <12 x float> @bitcast_v24f16_to_v12f32_scalar(<24 x half> inreg %a, i32 inreg %b) {
+define inreg <12 x float> @bitcast_v24f16_to_v12f32_scalar(<24 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v24f16_to_v12f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7024,7 +7024,7 @@ end:
   ret <12 x float> %phi
 }
 
-define <6 x i64> @bitcast_v6f64_to_v6i64(<6 x double> %a, i32 %b) {
+define <6 x i64> @bitcast_v6f64_to_v6i64(<6 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v6f64_to_v6i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7118,7 +7118,7 @@ end:
   ret <6 x i64> %phi
 }
 
-define inreg <6 x i64> @bitcast_v6f64_to_v6i64_scalar(<6 x double> inreg %a, i32 inreg %b) {
+define inreg <6 x i64> @bitcast_v6f64_to_v6i64_scalar(<6 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v6f64_to_v6i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7277,7 +7277,7 @@ end:
   ret <6 x i64> %phi
 }
 
-define <6 x double> @bitcast_v6i64_to_v6f64(<6 x i64> %a, i32 %b) {
+define <6 x double> @bitcast_v6i64_to_v6f64(<6 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v6i64_to_v6f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7398,7 +7398,7 @@ end:
   ret <6 x double> %phi
 }
 
-define inreg <6 x double> @bitcast_v6i64_to_v6f64_scalar(<6 x i64> inreg %a, i32 inreg %b) {
+define inreg <6 x double> @bitcast_v6i64_to_v6f64_scalar(<6 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v6i64_to_v6f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7559,7 +7559,7 @@ end:
   ret <6 x double> %phi
 }
 
-define <24 x i16> @bitcast_v6f64_to_v24i16(<6 x double> %a, i32 %b) {
+define <24 x i16> @bitcast_v6f64_to_v24i16(<6 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v6f64_to_v24i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7728,7 +7728,7 @@ end:
   ret <24 x i16> %phi
 }
 
-define inreg <24 x i16> @bitcast_v6f64_to_v24i16_scalar(<6 x double> inreg %a, i32 inreg %b) {
+define inreg <24 x i16> @bitcast_v6f64_to_v24i16_scalar(<6 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v6f64_to_v24i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7968,7 +7968,7 @@ end:
   ret <24 x i16> %phi
 }
 
-define <6 x double> @bitcast_v24i16_to_v6f64(<24 x i16> %a, i32 %b) {
+define <6 x double> @bitcast_v24i16_to_v6f64(<24 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v24i16_to_v6f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8240,7 +8240,7 @@ end:
   ret <6 x double> %phi
 }
 
-define inreg <6 x double> @bitcast_v24i16_to_v6f64_scalar(<24 x i16> inreg %a, i32 inreg %b) {
+define inreg <6 x double> @bitcast_v24i16_to_v6f64_scalar(<24 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v24i16_to_v6f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8582,7 +8582,7 @@ end:
   ret <6 x double> %phi
 }
 
-define <24 x half> @bitcast_v6f64_to_v24f16(<6 x double> %a, i32 %b) {
+define <24 x half> @bitcast_v6f64_to_v24f16(<6 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v6f64_to_v24f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8751,7 +8751,7 @@ end:
   ret <24 x half> %phi
 }
 
-define inreg <24 x half> @bitcast_v6f64_to_v24f16_scalar(<6 x double> inreg %a, i32 inreg %b) {
+define inreg <24 x half> @bitcast_v6f64_to_v24f16_scalar(<6 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v6f64_to_v24f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8991,7 +8991,7 @@ end:
   ret <24 x half> %phi
 }
 
-define <6 x double> @bitcast_v24f16_to_v6f64(<24 x half> %a, i32 %b) {
+define <6 x double> @bitcast_v24f16_to_v6f64(<24 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v24f16_to_v6f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9311,7 +9311,7 @@ end:
   ret <6 x double> %phi
 }
 
-define inreg <6 x double> @bitcast_v24f16_to_v6f64_scalar(<24 x half> inreg %a, i32 inreg %b) {
+define inreg <6 x double> @bitcast_v24f16_to_v6f64_scalar(<24 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v24f16_to_v6f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9702,7 +9702,7 @@ end:
   ret <6 x double> %phi
 }
 
-define <24 x i16> @bitcast_v6i64_to_v24i16(<6 x i64> %a, i32 %b) {
+define <24 x i16> @bitcast_v6i64_to_v24i16(<6 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v6i64_to_v24i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9898,7 +9898,7 @@ end:
   ret <24 x i16> %phi
 }
 
-define inreg <24 x i16> @bitcast_v6i64_to_v24i16_scalar(<6 x i64> inreg %a, i32 inreg %b) {
+define inreg <24 x i16> @bitcast_v6i64_to_v24i16_scalar(<6 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v6i64_to_v24i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10132,7 +10132,7 @@ end:
   ret <24 x i16> %phi
 }
 
-define <6 x i64> @bitcast_v24i16_to_v6i64(<24 x i16> %a, i32 %b) {
+define <6 x i64> @bitcast_v24i16_to_v6i64(<24 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v24i16_to_v6i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10404,7 +10404,7 @@ end:
   ret <6 x i64> %phi
 }
 
-define inreg <6 x i64> @bitcast_v24i16_to_v6i64_scalar(<24 x i16> inreg %a, i32 inreg %b) {
+define inreg <6 x i64> @bitcast_v24i16_to_v6i64_scalar(<24 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v24i16_to_v6i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10746,7 +10746,7 @@ end:
   ret <6 x i64> %phi
 }
 
-define <24 x half> @bitcast_v6i64_to_v24f16(<6 x i64> %a, i32 %b) {
+define <24 x half> @bitcast_v6i64_to_v24f16(<6 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v6i64_to_v24f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10942,7 +10942,7 @@ end:
   ret <24 x half> %phi
 }
 
-define inreg <24 x half> @bitcast_v6i64_to_v24f16_scalar(<6 x i64> inreg %a, i32 inreg %b) {
+define inreg <24 x half> @bitcast_v6i64_to_v24f16_scalar(<6 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v6i64_to_v24f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11176,7 +11176,7 @@ end:
   ret <24 x half> %phi
 }
 
-define <6 x i64> @bitcast_v24f16_to_v6i64(<24 x half> %a, i32 %b) {
+define <6 x i64> @bitcast_v24f16_to_v6i64(<24 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v24f16_to_v6i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11496,7 +11496,7 @@ end:
   ret <6 x i64> %phi
 }
 
-define inreg <6 x i64> @bitcast_v24f16_to_v6i64_scalar(<24 x half> inreg %a, i32 inreg %b) {
+define inreg <6 x i64> @bitcast_v24f16_to_v6i64_scalar(<24 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v24f16_to_v6i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11887,7 +11887,7 @@ end:
   ret <6 x i64> %phi
 }
 
-define <24 x half> @bitcast_v24i16_to_v24f16(<24 x i16> %a, i32 %b) {
+define <24 x half> @bitcast_v24i16_to_v24f16(<24 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v24i16_to_v24f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12214,7 +12214,7 @@ end:
   ret <24 x half> %phi
 }
 
-define inreg <24 x half> @bitcast_v24i16_to_v24f16_scalar(<24 x i16> inreg %a, i32 inreg %b) {
+define inreg <24 x half> @bitcast_v24i16_to_v24f16_scalar(<24 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v24i16_to_v24f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12603,7 +12603,7 @@ end:
   ret <24 x half> %phi
 }
 
-define <24 x i16> @bitcast_v24f16_to_v24i16(<24 x half> %a, i32 %b) {
+define <24 x i16> @bitcast_v24f16_to_v24i16(<24 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v24f16_to_v24i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12885,7 +12885,7 @@ end:
   ret <24 x i16> %phi
 }
 
-define inreg <24 x i16> @bitcast_v24f16_to_v24i16_scalar(<24 x half> inreg %a, i32 inreg %b) {
+define inreg <24 x i16> @bitcast_v24f16_to_v24i16_scalar(<24 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v24f16_to_v24i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13264,6 +13264,9 @@ end:
   %phi = phi <24 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
   ret <24 x i16> %phi
 }
+
+attributes #0 = { nounwind }
+
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; GFX11-FAKE16: {{.*}}
 ; GFX11-TRUE16: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll
index cd47411c7e51c..f7b7547cad1bd 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll
@@ -6,7 +6,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
-define <14 x float> @bitcast_v14i32_to_v14f32(<14 x i32> %a, i32 %b) {
+define <14 x float> @bitcast_v14i32_to_v14f32(<14 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v14i32_to_v14f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -132,7 +132,7 @@ end:
   ret <14 x float> %phi
 }
 
-define inreg <14 x float> @bitcast_v14i32_to_v14f32_scalar(<14 x i32> inreg %a, i32 inreg %b) {
+define inreg <14 x float> @bitcast_v14i32_to_v14f32_scalar(<14 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v14i32_to_v14f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -312,7 +312,7 @@ end:
   ret <14 x float> %phi
 }
 
-define <14 x i32> @bitcast_v14f32_to_v14i32(<14 x float> %a, i32 %b) {
+define <14 x i32> @bitcast_v14f32_to_v14i32(<14 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v14f32_to_v14i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -431,7 +431,7 @@ end:
   ret <14 x i32> %phi
 }
 
-define inreg <14 x i32> @bitcast_v14f32_to_v14i32_scalar(<14 x float> inreg %a, i32 inreg %b) {
+define inreg <14 x i32> @bitcast_v14f32_to_v14i32_scalar(<14 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v14f32_to_v14i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -625,7 +625,7 @@ end:
   ret <14 x i32> %phi
 }
 
-define <7 x i64> @bitcast_v14i32_to_v7i64(<14 x i32> %a, i32 %b) {
+define <7 x i64> @bitcast_v14i32_to_v7i64(<14 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v14i32_to_v7i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -751,7 +751,7 @@ end:
   ret <7 x i64> %phi
 }
 
-define inreg <7 x i64> @bitcast_v14i32_to_v7i64_scalar(<14 x i32> inreg %a, i32 inreg %b) {
+define inreg <7 x i64> @bitcast_v14i32_to_v7i64_scalar(<14 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v14i32_to_v7i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -931,7 +931,7 @@ end:
   ret <7 x i64> %phi
 }
 
-define <14 x i32> @bitcast_v7i64_to_v14i32(<7 x i64> %a, i32 %b) {
+define <14 x i32> @bitcast_v7i64_to_v14i32(<7 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v7i64_to_v14i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1061,7 +1061,7 @@ end:
   ret <14 x i32> %phi
 }
 
-define inreg <14 x i32> @bitcast_v7i64_to_v14i32_scalar(<7 x i64> inreg %a, i32 inreg %b) {
+define inreg <14 x i32> @bitcast_v7i64_to_v14i32_scalar(<7 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v7i64_to_v14i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1241,7 +1241,7 @@ end:
   ret <14 x i32> %phi
 }
 
-define <7 x double> @bitcast_v14i32_to_v7f64(<14 x i32> %a, i32 %b) {
+define <7 x double> @bitcast_v14i32_to_v7f64(<14 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v14i32_to_v7f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1367,7 +1367,7 @@ end:
   ret <7 x double> %phi
 }
 
-define inreg <7 x double> @bitcast_v14i32_to_v7f64_scalar(<14 x i32> inreg %a, i32 inreg %b) {
+define inreg <7 x double> @bitcast_v14i32_to_v7f64_scalar(<14 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v14i32_to_v7f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1547,7 +1547,7 @@ end:
   ret <7 x double> %phi
 }
 
-define <14 x i32> @bitcast_v7f64_to_v14i32(<7 x double> %a, i32 %b) {
+define <14 x i32> @bitcast_v7f64_to_v14i32(<7 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v7f64_to_v14i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1645,7 +1645,7 @@ end:
   ret <14 x i32> %phi
 }
 
-define inreg <14 x i32> @bitcast_v7f64_to_v14i32_scalar(<7 x double> inreg %a, i32 inreg %b) {
+define inreg <14 x i32> @bitcast_v7f64_to_v14i32_scalar(<7 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v7f64_to_v14i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1811,7 +1811,7 @@ end:
   ret <14 x i32> %phi
 }
 
-define <28 x i16> @bitcast_v14i32_to_v28i16(<14 x i32> %a, i32 %b) {
+define <28 x i16> @bitcast_v14i32_to_v28i16(<14 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v14i32_to_v28i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2024,7 +2024,7 @@ end:
   ret <28 x i16> %phi
 }
 
-define inreg <28 x i16> @bitcast_v14i32_to_v28i16_scalar(<14 x i32> inreg %a, i32 inreg %b) {
+define inreg <28 x i16> @bitcast_v14i32_to_v28i16_scalar(<14 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v14i32_to_v28i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2288,7 +2288,7 @@ end:
   ret <28 x i16> %phi
 }
 
-define <14 x i32> @bitcast_v28i16_to_v14i32(<28 x i16> %a, i32 %b) {
+define <14 x i32> @bitcast_v28i16_to_v14i32(<28 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v28i16_to_v14i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2592,7 +2592,7 @@ end:
   ret <14 x i32> %phi
 }
 
-define inreg <14 x i32> @bitcast_v28i16_to_v14i32_scalar(<28 x i16> inreg %a, i32 inreg %b) {
+define inreg <14 x i32> @bitcast_v28i16_to_v14i32_scalar(<28 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v28i16_to_v14i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2973,7 +2973,7 @@ end:
   ret <14 x i32> %phi
 }
 
-define <28 x half> @bitcast_v14i32_to_v28f16(<14 x i32> %a, i32 %b) {
+define <28 x half> @bitcast_v14i32_to_v28f16(<14 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v14i32_to_v28f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3186,7 +3186,7 @@ end:
   ret <28 x half> %phi
 }
 
-define inreg <28 x half> @bitcast_v14i32_to_v28f16_scalar(<14 x i32> inreg %a, i32 inreg %b) {
+define inreg <28 x half> @bitcast_v14i32_to_v28f16_scalar(<14 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v14i32_to_v28f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3450,7 +3450,7 @@ end:
   ret <28 x half> %phi
 }
 
-define <14 x i32> @bitcast_v28f16_to_v14i32(<28 x half> %a, i32 %b) {
+define <14 x i32> @bitcast_v28f16_to_v14i32(<28 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v28f16_to_v14i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3810,7 +3810,7 @@ end:
   ret <14 x i32> %phi
 }
 
-define inreg <14 x i32> @bitcast_v28f16_to_v14i32_scalar(<28 x half> inreg %a, i32 inreg %b) {
+define inreg <14 x i32> @bitcast_v28f16_to_v14i32_scalar(<28 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v28f16_to_v14i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4242,7 +4242,7 @@ end:
   ret <14 x i32> %phi
 }
 
-define <7 x i64> @bitcast_v14f32_to_v7i64(<14 x float> %a, i32 %b) {
+define <7 x i64> @bitcast_v14f32_to_v7i64(<14 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v14f32_to_v7i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4361,7 +4361,7 @@ end:
   ret <7 x i64> %phi
 }
 
-define inreg <7 x i64> @bitcast_v14f32_to_v7i64_scalar(<14 x float> inreg %a, i32 inreg %b) {
+define inreg <7 x i64> @bitcast_v14f32_to_v7i64_scalar(<14 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v14f32_to_v7i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4555,7 +4555,7 @@ end:
   ret <7 x i64> %phi
 }
 
-define <14 x float> @bitcast_v7i64_to_v14f32(<7 x i64> %a, i32 %b) {
+define <14 x float> @bitcast_v7i64_to_v14f32(<7 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v7i64_to_v14f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4685,7 +4685,7 @@ end:
   ret <14 x float> %phi
 }
 
-define inreg <14 x float> @bitcast_v7i64_to_v14f32_scalar(<7 x i64> inreg %a, i32 inreg %b) {
+define inreg <14 x float> @bitcast_v7i64_to_v14f32_scalar(<7 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v7i64_to_v14f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4865,7 +4865,7 @@ end:
   ret <14 x float> %phi
 }
 
-define <7 x double> @bitcast_v14f32_to_v7f64(<14 x float> %a, i32 %b) {
+define <7 x double> @bitcast_v14f32_to_v7f64(<14 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v14f32_to_v7f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4984,7 +4984,7 @@ end:
   ret <7 x double> %phi
 }
 
-define inreg <7 x double> @bitcast_v14f32_to_v7f64_scalar(<14 x float> inreg %a, i32 inreg %b) {
+define inreg <7 x double> @bitcast_v14f32_to_v7f64_scalar(<14 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v14f32_to_v7f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5178,7 +5178,7 @@ end:
   ret <7 x double> %phi
 }
 
-define <14 x float> @bitcast_v7f64_to_v14f32(<7 x double> %a, i32 %b) {
+define <14 x float> @bitcast_v7f64_to_v14f32(<7 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v7f64_to_v14f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5276,7 +5276,7 @@ end:
   ret <14 x float> %phi
 }
 
-define inreg <14 x float> @bitcast_v7f64_to_v14f32_scalar(<7 x double> inreg %a, i32 inreg %b) {
+define inreg <14 x float> @bitcast_v7f64_to_v14f32_scalar(<7 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v7f64_to_v14f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5442,7 +5442,7 @@ end:
   ret <14 x float> %phi
 }
 
-define <28 x i16> @bitcast_v14f32_to_v28i16(<14 x float> %a, i32 %b) {
+define <28 x i16> @bitcast_v14f32_to_v28i16(<14 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v14f32_to_v28i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5648,7 +5648,7 @@ end:
   ret <28 x i16> %phi
 }
 
-define inreg <28 x i16> @bitcast_v14f32_to_v28i16_scalar(<14 x float> inreg %a, i32 inreg %b) {
+define inreg <28 x i16> @bitcast_v14f32_to_v28i16_scalar(<14 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v14f32_to_v28i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5939,7 +5939,7 @@ end:
   ret <28 x i16> %phi
 }
 
-define <14 x float> @bitcast_v28i16_to_v14f32(<28 x i16> %a, i32 %b) {
+define <14 x float> @bitcast_v28i16_to_v14f32(<28 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v28i16_to_v14f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6243,7 +6243,7 @@ end:
   ret <14 x float> %phi
 }
 
-define inreg <14 x float> @bitcast_v28i16_to_v14f32_scalar(<28 x i16> inreg %a, i32 inreg %b) {
+define inreg <14 x float> @bitcast_v28i16_to_v14f32_scalar(<28 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v28i16_to_v14f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6624,7 +6624,7 @@ end:
   ret <14 x float> %phi
 }
 
-define <28 x half> @bitcast_v14f32_to_v28f16(<14 x float> %a, i32 %b) {
+define <28 x half> @bitcast_v14f32_to_v28f16(<14 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v14f32_to_v28f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6830,7 +6830,7 @@ end:
   ret <28 x half> %phi
 }
 
-define inreg <28 x half> @bitcast_v14f32_to_v28f16_scalar(<14 x float> inreg %a, i32 inreg %b) {
+define inreg <28 x half> @bitcast_v14f32_to_v28f16_scalar(<14 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v14f32_to_v28f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7121,7 +7121,7 @@ end:
   ret <28 x half> %phi
 }
 
-define <14 x float> @bitcast_v28f16_to_v14f32(<28 x half> %a, i32 %b) {
+define <14 x float> @bitcast_v28f16_to_v14f32(<28 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v28f16_to_v14f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7481,7 +7481,7 @@ end:
   ret <14 x float> %phi
 }
 
-define inreg <14 x float> @bitcast_v28f16_to_v14f32_scalar(<28 x half> inreg %a, i32 inreg %b) {
+define inreg <14 x float> @bitcast_v28f16_to_v14f32_scalar(<28 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v28f16_to_v14f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7913,7 +7913,7 @@ end:
   ret <14 x float> %phi
 }
 
-define <7 x double> @bitcast_v7i64_to_v7f64(<7 x i64> %a, i32 %b) {
+define <7 x double> @bitcast_v7i64_to_v7f64(<7 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v7i64_to_v7f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8043,7 +8043,7 @@ end:
   ret <7 x double> %phi
 }
 
-define inreg <7 x double> @bitcast_v7i64_to_v7f64_scalar(<7 x i64> inreg %a, i32 inreg %b) {
+define inreg <7 x double> @bitcast_v7i64_to_v7f64_scalar(<7 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v7i64_to_v7f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8222,7 +8222,7 @@ end:
   ret <7 x double> %phi
 }
 
-define <7 x i64> @bitcast_v7f64_to_v7i64(<7 x double> %a, i32 %b) {
+define <7 x i64> @bitcast_v7f64_to_v7i64(<7 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v7f64_to_v7i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8320,7 +8320,7 @@ end:
   ret <7 x i64> %phi
 }
 
-define inreg <7 x i64> @bitcast_v7f64_to_v7i64_scalar(<7 x double> inreg %a, i32 inreg %b) {
+define inreg <7 x i64> @bitcast_v7f64_to_v7i64_scalar(<7 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v7f64_to_v7i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8486,7 +8486,7 @@ end:
   ret <7 x i64> %phi
 }
 
-define <28 x i16> @bitcast_v7i64_to_v28i16(<7 x i64> %a, i32 %b) {
+define <28 x i16> @bitcast_v7i64_to_v28i16(<7 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v7i64_to_v28i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8703,7 +8703,7 @@ end:
   ret <28 x i16> %phi
 }
 
-define inreg <28 x i16> @bitcast_v7i64_to_v28i16_scalar(<7 x i64> inreg %a, i32 inreg %b) {
+define inreg <28 x i16> @bitcast_v7i64_to_v28i16_scalar(<7 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v7i64_to_v28i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8967,7 +8967,7 @@ end:
   ret <28 x i16> %phi
 }
 
-define <7 x i64> @bitcast_v28i16_to_v7i64(<28 x i16> %a, i32 %b) {
+define <7 x i64> @bitcast_v28i16_to_v7i64(<28 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v28i16_to_v7i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9271,7 +9271,7 @@ end:
   ret <7 x i64> %phi
 }
 
-define inreg <7 x i64> @bitcast_v28i16_to_v7i64_scalar(<28 x i16> inreg %a, i32 inreg %b) {
+define inreg <7 x i64> @bitcast_v28i16_to_v7i64_scalar(<28 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v28i16_to_v7i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9652,7 +9652,7 @@ end:
   ret <7 x i64> %phi
 }
 
-define <28 x half> @bitcast_v7i64_to_v28f16(<7 x i64> %a, i32 %b) {
+define <28 x half> @bitcast_v7i64_to_v28f16(<7 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v7i64_to_v28f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9869,7 +9869,7 @@ end:
   ret <28 x half> %phi
 }
 
-define inreg <28 x half> @bitcast_v7i64_to_v28f16_scalar(<7 x i64> inreg %a, i32 inreg %b) {
+define inreg <28 x half> @bitcast_v7i64_to_v28f16_scalar(<7 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v7i64_to_v28f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10133,7 +10133,7 @@ end:
   ret <28 x half> %phi
 }
 
-define <7 x i64> @bitcast_v28f16_to_v7i64(<28 x half> %a, i32 %b) {
+define <7 x i64> @bitcast_v28f16_to_v7i64(<28 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v28f16_to_v7i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10493,7 +10493,7 @@ end:
   ret <7 x i64> %phi
 }
 
-define inreg <7 x i64> @bitcast_v28f16_to_v7i64_scalar(<28 x half> inreg %a, i32 inreg %b) {
+define inreg <7 x i64> @bitcast_v28f16_to_v7i64_scalar(<28 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v28f16_to_v7i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10925,7 +10925,7 @@ end:
   ret <7 x i64> %phi
 }
 
-define <28 x i16> @bitcast_v7f64_to_v28i16(<7 x double> %a, i32 %b) {
+define <28 x i16> @bitcast_v7f64_to_v28i16(<7 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v7f64_to_v28i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11110,7 +11110,7 @@ end:
   ret <28 x i16> %phi
 }
 
-define inreg <28 x i16> @bitcast_v7f64_to_v28i16_scalar(<7 x double> inreg %a, i32 inreg %b) {
+define inreg <28 x i16> @bitcast_v7f64_to_v28i16_scalar(<7 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v7f64_to_v28i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11373,7 +11373,7 @@ end:
   ret <28 x i16> %phi
 }
 
-define <7 x double> @bitcast_v28i16_to_v7f64(<28 x i16> %a, i32 %b) {
+define <7 x double> @bitcast_v28i16_to_v7f64(<28 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v28i16_to_v7f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11677,7 +11677,7 @@ end:
   ret <7 x double> %phi
 }
 
-define inreg <7 x double> @bitcast_v28i16_to_v7f64_scalar(<28 x i16> inreg %a, i32 inreg %b) {
+define inreg <7 x double> @bitcast_v28i16_to_v7f64_scalar(<28 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v28i16_to_v7f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12058,7 +12058,7 @@ end:
   ret <7 x double> %phi
 }
 
-define <28 x half> @bitcast_v7f64_to_v28f16(<7 x double> %a, i32 %b) {
+define <28 x half> @bitcast_v7f64_to_v28f16(<7 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v7f64_to_v28f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12243,7 +12243,7 @@ end:
   ret <28 x half> %phi
 }
 
-define inreg <28 x half> @bitcast_v7f64_to_v28f16_scalar(<7 x double> inreg %a, i32 inreg %b) {
+define inreg <28 x half> @bitcast_v7f64_to_v28f16_scalar(<7 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v7f64_to_v28f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12506,7 +12506,7 @@ end:
   ret <28 x half> %phi
 }
 
-define <7 x double> @bitcast_v28f16_to_v7f64(<28 x half> %a, i32 %b) {
+define <7 x double> @bitcast_v28f16_to_v7f64(<28 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v28f16_to_v7f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12866,7 +12866,7 @@ end:
   ret <7 x double> %phi
 }
 
-define inreg <7 x double> @bitcast_v28f16_to_v7f64_scalar(<28 x half> inreg %a, i32 inreg %b) {
+define inreg <7 x double> @bitcast_v28f16_to_v7f64_scalar(<28 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v28f16_to_v7f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13298,7 +13298,7 @@ end:
   ret <7 x double> %phi
 }
 
-define <28 x half> @bitcast_v28i16_to_v28f16(<28 x i16> %a, i32 %b) {
+define <28 x half> @bitcast_v28i16_to_v28f16(<28 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v28i16_to_v28f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13688,7 +13688,7 @@ end:
   ret <28 x half> %phi
 }
 
-define inreg <28 x half> @bitcast_v28i16_to_v28f16_scalar(<28 x i16> inreg %a, i32 inreg %b) {
+define inreg <28 x half> @bitcast_v28i16_to_v28f16_scalar(<28 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v28i16_to_v28f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14128,7 +14128,7 @@ end:
   ret <28 x half> %phi
 }
 
-define <28 x i16> @bitcast_v28f16_to_v28i16(<28 x half> %a, i32 %b) {
+define <28 x i16> @bitcast_v28f16_to_v28i16(<28 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v28f16_to_v28i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14445,7 +14445,7 @@ end:
   ret <28 x i16> %phi
 }
 
-define inreg <28 x i16> @bitcast_v28f16_to_v28i16_scalar(<28 x half> inreg %a, i32 inreg %b) {
+define inreg <28 x i16> @bitcast_v28f16_to_v28i16_scalar(<28 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v28f16_to_v28i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14870,6 +14870,8 @@ end:
   %phi = phi <28 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
   ret <28 x i16> %phi
 }
+attributes #0 = { nounwind }
+
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; GFX11-FAKE16: {{.*}}
 ; GFX11-TRUE16: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll
index 547985e7ef4e3..121aad5b96cb3 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll
@@ -6,7 +6,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
-define <3 x half> @bitcast_v3bf16_to_v3f16(<3 x bfloat> %a, i32 %b) {
+define <3 x half> @bitcast_v3bf16_to_v3f16(<3 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v3bf16_to_v3f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -242,7 +242,7 @@ end:
   ret <3 x half> %phi
 }
 
-define inreg <3 x half> @bitcast_v3bf16_to_v3f16_scalar(<3 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <3 x half> @bitcast_v3bf16_to_v3f16_scalar(<3 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v3bf16_to_v3f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -500,7 +500,7 @@ end:
   ret <3 x half> %phi
 }
 
-define <3 x bfloat> @bitcast_v3f16_to_v3bf16(<3 x half> %a, i32 %b) {
+define <3 x bfloat> @bitcast_v3f16_to_v3bf16(<3 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v3f16_to_v3bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -608,7 +608,7 @@ end:
   ret <3 x bfloat> %phi
 }
 
-define inreg <3 x bfloat> @bitcast_v3f16_to_v3bf16_scalar(<3 x half> inreg %a, i32 inreg %b) {
+define inreg <3 x bfloat> @bitcast_v3f16_to_v3bf16_scalar(<3 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v3f16_to_v3bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -730,7 +730,7 @@ end:
   ret <3 x bfloat> %phi
 }
 
-define <3 x i16> @bitcast_v3bf16_to_v3i16(<3 x bfloat> %a, i32 %b) {
+define <3 x i16> @bitcast_v3bf16_to_v3i16(<3 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v3bf16_to_v3i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -960,7 +960,7 @@ end:
   ret <3 x i16> %phi
 }
 
-define inreg <3 x i16> @bitcast_v3bf16_to_v3i16_scalar(<3 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <3 x i16> @bitcast_v3bf16_to_v3i16_scalar(<3 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v3bf16_to_v3i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1209,7 +1209,7 @@ end:
   ret <3 x i16> %phi
 }
 
-define <3 x bfloat> @bitcast_v3i16_to_v3bf16(<3 x i16> %a, i32 %b) {
+define <3 x bfloat> @bitcast_v3i16_to_v3bf16(<3 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v3i16_to_v3bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1305,7 +1305,7 @@ end:
   ret <3 x bfloat> %phi
 }
 
-define inreg <3 x bfloat> @bitcast_v3i16_to_v3bf16_scalar(<3 x i16> inreg %a, i32 inreg %b) {
+define inreg <3 x bfloat> @bitcast_v3i16_to_v3bf16_scalar(<3 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v3i16_to_v3bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1415,7 +1415,7 @@ end:
   ret <3 x bfloat> %phi
 }
 
-define <3 x i16> @bitcast_v3f16_to_v3i16(<3 x half> %a, i32 %b) {
+define <3 x i16> @bitcast_v3f16_to_v3i16(<3 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v3f16_to_v3i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1509,7 +1509,7 @@ end:
   ret <3 x i16> %phi
 }
 
-define inreg <3 x i16> @bitcast_v3f16_to_v3i16_scalar(<3 x half> inreg %a, i32 inreg %b) {
+define inreg <3 x i16> @bitcast_v3f16_to_v3i16_scalar(<3 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v3f16_to_v3i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1622,7 +1622,7 @@ end:
   ret <3 x i16> %phi
 }
 
-define <3 x half> @bitcast_v3i16_to_v3f16(<3 x i16> %a, i32 %b) {
+define <3 x half> @bitcast_v3i16_to_v3f16(<3 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v3i16_to_v3f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1722,7 +1722,7 @@ end:
   ret <3 x half> %phi
 }
 
-define inreg <3 x half> @bitcast_v3i16_to_v3f16_scalar(<3 x i16> inreg %a, i32 inreg %b) {
+define inreg <3 x half> @bitcast_v3i16_to_v3f16_scalar(<3 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v3i16_to_v3f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1832,3 +1832,5 @@ end:
   %phi = phi <3 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
   ret <3 x half> %phi
 }
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
index 74626bd38f582..791d1c24a6922 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
@@ -1,12 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs=0 < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=VI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
-define <16 x float> @bitcast_v16i32_to_v16f32(<16 x i32> %a, i32 %b) {
+define <16 x float> @bitcast_v16i32_to_v16f32(<16 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16i32_to_v16f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -140,7 +140,7 @@ end:
   ret <16 x float> %phi
 }
 
-define inreg <16 x float> @bitcast_v16i32_to_v16f32_scalar(<16 x i32> inreg %a, i32 inreg %b) {
+define inreg <16 x float> @bitcast_v16i32_to_v16f32_scalar(<16 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16i32_to_v16f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -341,7 +341,7 @@ end:
   ret <16 x float> %phi
 }
 
-define <16 x i32> @bitcast_v16f32_to_v16i32(<16 x float> %a, i32 %b) {
+define <16 x i32> @bitcast_v16f32_to_v16i32(<16 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16f32_to_v16i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -467,7 +467,7 @@ end:
   ret <16 x i32> %phi
 }
 
-define inreg <16 x i32> @bitcast_v16f32_to_v16i32_scalar(<16 x float> inreg %a, i32 inreg %b) {
+define inreg <16 x i32> @bitcast_v16f32_to_v16i32_scalar(<16 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16f32_to_v16i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -522,8 +522,8 @@ define inreg <16 x i32> @bitcast_v16f32_to_v16i32_scalar(<16 x float> inreg %a,
 ; SI-NEXT:    v_mov_b32_e32 v14, s30
 ; SI-NEXT:    v_mov_b32_e32 v15, s31
 ; SI-NEXT:  .LBB3_5: ; %end
-; SI-NEXT:    v_readlane_b32 s31, v16, 1
 ; SI-NEXT:    v_readlane_b32 s30, v16, 0
+; SI-NEXT:    v_readlane_b32 s31, v16, 1
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -583,8 +583,8 @@ define inreg <16 x i32> @bitcast_v16f32_to_v16i32_scalar(<16 x float> inreg %a,
 ; VI-NEXT:    v_mov_b32_e32 v14, s30
 ; VI-NEXT:    v_mov_b32_e32 v15, s31
 ; VI-NEXT:  .LBB3_5: ; %end
-; VI-NEXT:    v_readlane_b32 s31, v16, 1
 ; VI-NEXT:    v_readlane_b32 s30, v16, 0
+; VI-NEXT:    v_readlane_b32 s31, v16, 1
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -644,8 +644,8 @@ define inreg <16 x i32> @bitcast_v16f32_to_v16i32_scalar(<16 x float> inreg %a,
 ; GFX9-NEXT:    v_mov_b32_e32 v14, s30
 ; GFX9-NEXT:    v_mov_b32_e32 v15, s31
 ; GFX9-NEXT:  .LBB3_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v16, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -712,7 +712,7 @@ end:
   ret <16 x i32> %phi
 }
 
-define <8 x i64> @bitcast_v16i32_to_v8i64(<16 x i32> %a, i32 %b) {
+define <8 x i64> @bitcast_v16i32_to_v8i64(<16 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16i32_to_v8i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -846,7 +846,7 @@ end:
   ret <8 x i64> %phi
 }
 
-define inreg <8 x i64> @bitcast_v16i32_to_v8i64_scalar(<16 x i32> inreg %a, i32 inreg %b) {
+define inreg <8 x i64> @bitcast_v16i32_to_v8i64_scalar(<16 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16i32_to_v8i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1047,7 +1047,7 @@ end:
   ret <8 x i64> %phi
 }
 
-define <16 x i32> @bitcast_v8i64_to_v16i32(<8 x i64> %a, i32 %b) {
+define <16 x i32> @bitcast_v8i64_to_v16i32(<8 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8i64_to_v16i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1185,7 +1185,7 @@ end:
   ret <16 x i32> %phi
 }
 
-define inreg <16 x i32> @bitcast_v8i64_to_v16i32_scalar(<8 x i64> inreg %a, i32 inreg %b) {
+define inreg <16 x i32> @bitcast_v8i64_to_v16i32_scalar(<8 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8i64_to_v16i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1386,7 +1386,7 @@ end:
   ret <16 x i32> %phi
 }
 
-define <8 x double> @bitcast_v16i32_to_v8f64(<16 x i32> %a, i32 %b) {
+define <8 x double> @bitcast_v16i32_to_v8f64(<16 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16i32_to_v8f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1520,7 +1520,7 @@ end:
   ret <8 x double> %phi
 }
 
-define inreg <8 x double> @bitcast_v16i32_to_v8f64_scalar(<16 x i32> inreg %a, i32 inreg %b) {
+define inreg <8 x double> @bitcast_v16i32_to_v8f64_scalar(<16 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16i32_to_v8f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1721,7 +1721,7 @@ end:
   ret <8 x double> %phi
 }
 
-define <16 x i32> @bitcast_v8f64_to_v16i32(<8 x double> %a, i32 %b) {
+define <16 x i32> @bitcast_v8f64_to_v16i32(<8 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8f64_to_v16i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1823,7 +1823,7 @@ end:
   ret <16 x i32> %phi
 }
 
-define inreg <16 x i32> @bitcast_v8f64_to_v16i32_scalar(<8 x double> inreg %a, i32 inreg %b) {
+define inreg <16 x i32> @bitcast_v8f64_to_v16i32_scalar(<8 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8f64_to_v16i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1870,8 +1870,8 @@ define inreg <16 x i32> @bitcast_v8f64_to_v16i32_scalar(<8 x double> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v14, s30
 ; SI-NEXT:    v_mov_b32_e32 v15, s31
 ; SI-NEXT:  .LBB11_5: ; %end
-; SI-NEXT:    v_readlane_b32 s31, v16, 1
 ; SI-NEXT:    v_readlane_b32 s30, v16, 0
+; SI-NEXT:    v_readlane_b32 s31, v16, 1
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -1923,8 +1923,8 @@ define inreg <16 x i32> @bitcast_v8f64_to_v16i32_scalar(<8 x double> inreg %a, i
 ; VI-NEXT:    v_mov_b32_e32 v14, s30
 ; VI-NEXT:    v_mov_b32_e32 v15, s31
 ; VI-NEXT:  .LBB11_5: ; %end
-; VI-NEXT:    v_readlane_b32 s31, v16, 1
 ; VI-NEXT:    v_readlane_b32 s30, v16, 0
+; VI-NEXT:    v_readlane_b32 s31, v16, 1
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -1976,8 +1976,8 @@ define inreg <16 x i32> @bitcast_v8f64_to_v16i32_scalar(<8 x double> inreg %a, i
 ; GFX9-NEXT:    v_mov_b32_e32 v14, s30
 ; GFX9-NEXT:    v_mov_b32_e32 v15, s31
 ; GFX9-NEXT:  .LBB11_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v16, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -2036,7 +2036,7 @@ end:
   ret <16 x i32> %phi
 }
 
-define <32 x i16> @bitcast_v16i32_to_v32i16(<16 x i32> %a, i32 %b) {
+define <32 x i16> @bitcast_v16i32_to_v32i16(<16 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16i32_to_v32i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2269,7 +2269,7 @@ end:
   ret <32 x i16> %phi
 }
 
-define inreg <32 x i16> @bitcast_v16i32_to_v32i16_scalar(<16 x i32> inreg %a, i32 inreg %b) {
+define inreg <32 x i16> @bitcast_v16i32_to_v32i16_scalar(<16 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16i32_to_v32i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2566,7 +2566,7 @@ end:
   ret <32 x i16> %phi
 }
 
-define <16 x i32> @bitcast_v32i16_to_v16i32(<32 x i16> %a, i32 %b) {
+define <16 x i32> @bitcast_v32i16_to_v16i32(<32 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v32i16_to_v16i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2902,7 +2902,7 @@ end:
   ret <16 x i32> %phi
 }
 
-define inreg <16 x i32> @bitcast_v32i16_to_v16i32_scalar(<32 x i16> inreg %a, i32 inreg %b) {
+define inreg <16 x i32> @bitcast_v32i16_to_v16i32_scalar(<32 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v32i16_to_v16i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3269,8 +3269,8 @@ define inreg <16 x i32> @bitcast_v32i16_to_v16i32_scalar(<32 x i16> inreg %a, i3
 ; GFX9-NEXT:    v_mov_b32_e32 v14, s30
 ; GFX9-NEXT:    v_mov_b32_e32 v15, s31
 ; GFX9-NEXT:  .LBB15_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v16, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -3337,7 +3337,7 @@ end:
   ret <16 x i32> %phi
 }
 
-define <32 x half> @bitcast_v16i32_to_v32f16(<16 x i32> %a, i32 %b) {
+define <32 x half> @bitcast_v16i32_to_v32f16(<16 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16i32_to_v32f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3570,7 +3570,7 @@ end:
   ret <32 x half> %phi
 }
 
-define inreg <32 x half> @bitcast_v16i32_to_v32f16_scalar(<16 x i32> inreg %a, i32 inreg %b) {
+define inreg <32 x half> @bitcast_v16i32_to_v32f16_scalar(<16 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16i32_to_v32f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3867,7 +3867,7 @@ end:
   ret <32 x half> %phi
 }
 
-define <16 x i32> @bitcast_v32f16_to_v16i32(<32 x half> %a, i32 %b) {
+define <16 x i32> @bitcast_v32f16_to_v16i32(<32 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v32f16_to_v16i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4267,7 +4267,7 @@ end:
   ret <16 x i32> %phi
 }
 
-define inreg <16 x i32> @bitcast_v32f16_to_v16i32_scalar(<32 x half> inreg %a, i32 inreg %b) {
+define inreg <16 x i32> @bitcast_v32f16_to_v16i32_scalar(<32 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v32f16_to_v16i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4637,8 +4637,8 @@ define inreg <16 x i32> @bitcast_v32f16_to_v16i32_scalar(<32 x half> inreg %a, i
 ; VI-NEXT:    v_mov_b32_e32 v14, s30
 ; VI-NEXT:    v_mov_b32_e32 v15, s31
 ; VI-NEXT:  .LBB19_5: ; %end
-; VI-NEXT:    v_readlane_b32 s31, v17, 1
 ; VI-NEXT:    v_readlane_b32 s30, v17, 0
+; VI-NEXT:    v_readlane_b32 s31, v17, 1
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -4699,8 +4699,8 @@ define inreg <16 x i32> @bitcast_v32f16_to_v16i32_scalar(<32 x half> inreg %a, i
 ; GFX9-NEXT:    v_mov_b32_e32 v14, s30
 ; GFX9-NEXT:    v_mov_b32_e32 v15, s31
 ; GFX9-NEXT:  .LBB19_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v16, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -4767,7 +4767,7 @@ end:
   ret <16 x i32> %phi
 }
 
-define <32 x bfloat> @bitcast_v16i32_to_v32bf16(<16 x i32> %a, i32 %b) {
+define <32 x bfloat> @bitcast_v16i32_to_v32bf16(<16 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16i32_to_v32bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5080,7 +5080,7 @@ end:
   ret <32 x bfloat> %phi
 }
 
-define inreg <32 x bfloat> @bitcast_v16i32_to_v32bf16_scalar(<16 x i32> inreg %a, i32 inreg %b) {
+define inreg <32 x bfloat> @bitcast_v16i32_to_v32bf16_scalar(<16 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16i32_to_v32bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5425,7 +5425,7 @@ end:
   ret <32 x bfloat> %phi
 }
 
-define <16 x i32> @bitcast_v32bf16_to_v16i32(<32 x bfloat> %a, i32 %b) {
+define <16 x i32> @bitcast_v32bf16_to_v16i32(<32 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v32bf16_to_v16i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6831,7 +6831,7 @@ end:
   ret <16 x i32> %phi
 }
 
-define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v32bf16_to_v16i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7411,8 +7411,8 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a
 ; VI-NEXT:    v_mov_b32_e32 v14, s30
 ; VI-NEXT:    v_mov_b32_e32 v15, s31
 ; VI-NEXT:  .LBB23_5: ; %end
-; VI-NEXT:    v_readlane_b32 s31, v20, 1
 ; VI-NEXT:    v_readlane_b32 s30, v20, 0
+; VI-NEXT:    v_readlane_b32 s31, v20, 1
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -7762,8 +7762,8 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a
 ; GFX9-NEXT:    v_mov_b32_e32 v14, s30
 ; GFX9-NEXT:    v_mov_b32_e32 v15, s31
 ; GFX9-NEXT:  .LBB23_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s31, v20, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v20, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v20, 1
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -8477,7 +8477,7 @@ end:
   ret <16 x i32> %phi
 }
 
-define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) {
+define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16i32_to_v64i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9944,7 +9944,7 @@ end:
   ret <64 x i8> %phi
 }
 
-define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32 inreg %b) {
+define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16i32_to_v64i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9952,40 +9952,40 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v4, s30, 0
-; SI-NEXT:    v_writelane_b32 v4, s31, 1
-; SI-NEXT:    v_writelane_b32 v4, s34, 2
-; SI-NEXT:    v_writelane_b32 v4, s35, 3
-; SI-NEXT:    v_writelane_b32 v4, s36, 4
-; SI-NEXT:    v_writelane_b32 v4, s37, 5
-; SI-NEXT:    v_writelane_b32 v4, s38, 6
-; SI-NEXT:    v_writelane_b32 v4, s39, 7
-; SI-NEXT:    v_writelane_b32 v4, s48, 8
-; SI-NEXT:    v_writelane_b32 v4, s49, 9
-; SI-NEXT:    v_writelane_b32 v4, s50, 10
-; SI-NEXT:    v_writelane_b32 v4, s51, 11
-; SI-NEXT:    v_writelane_b32 v4, s52, 12
-; SI-NEXT:    v_writelane_b32 v4, s53, 13
-; SI-NEXT:    v_writelane_b32 v4, s54, 14
-; SI-NEXT:    v_writelane_b32 v4, s55, 15
-; SI-NEXT:    v_writelane_b32 v4, s64, 16
-; SI-NEXT:    v_writelane_b32 v4, s65, 17
-; SI-NEXT:    v_writelane_b32 v4, s66, 18
-; SI-NEXT:    v_writelane_b32 v4, s67, 19
-; SI-NEXT:    v_writelane_b32 v4, s68, 20
-; SI-NEXT:    v_writelane_b32 v4, s69, 21
-; SI-NEXT:    v_writelane_b32 v4, s70, 22
-; SI-NEXT:    v_writelane_b32 v4, s71, 23
-; SI-NEXT:    v_writelane_b32 v4, s80, 24
-; SI-NEXT:    v_writelane_b32 v4, s81, 25
-; SI-NEXT:    v_writelane_b32 v4, s82, 26
-; SI-NEXT:    v_writelane_b32 v4, s83, 27
+; SI-NEXT:    v_writelane_b32 v4, s34, 0
+; SI-NEXT:    v_writelane_b32 v4, s35, 1
+; SI-NEXT:    v_writelane_b32 v4, s36, 2
+; SI-NEXT:    v_writelane_b32 v4, s37, 3
+; SI-NEXT:    v_writelane_b32 v4, s38, 4
+; SI-NEXT:    v_writelane_b32 v4, s39, 5
+; SI-NEXT:    v_writelane_b32 v4, s48, 6
+; SI-NEXT:    v_writelane_b32 v4, s49, 7
+; SI-NEXT:    v_writelane_b32 v4, s50, 8
+; SI-NEXT:    v_writelane_b32 v4, s51, 9
+; SI-NEXT:    v_writelane_b32 v4, s52, 10
+; SI-NEXT:    v_writelane_b32 v4, s53, 11
+; SI-NEXT:    v_writelane_b32 v4, s54, 12
+; SI-NEXT:    v_writelane_b32 v4, s55, 13
+; SI-NEXT:    v_writelane_b32 v4, s64, 14
+; SI-NEXT:    v_writelane_b32 v4, s65, 15
+; SI-NEXT:    v_writelane_b32 v4, s66, 16
+; SI-NEXT:    v_writelane_b32 v4, s67, 17
+; SI-NEXT:    v_writelane_b32 v4, s68, 18
+; SI-NEXT:    v_writelane_b32 v4, s69, 19
+; SI-NEXT:    v_writelane_b32 v4, s70, 20
+; SI-NEXT:    v_writelane_b32 v4, s71, 21
+; SI-NEXT:    v_writelane_b32 v4, s80, 22
+; SI-NEXT:    v_writelane_b32 v4, s81, 23
+; SI-NEXT:    v_writelane_b32 v4, s82, 24
+; SI-NEXT:    v_writelane_b32 v4, s83, 25
+; SI-NEXT:    v_writelane_b32 v4, s84, 26
+; SI-NEXT:    v_writelane_b32 v4, s85, 27
 ; SI-NEXT:    v_readfirstlane_b32 s4, v3
-; SI-NEXT:    v_writelane_b32 v4, s84, 28
+; SI-NEXT:    v_writelane_b32 v4, s30, 28
 ; SI-NEXT:    v_readfirstlane_b32 s5, v2
 ; SI-NEXT:    s_cmp_lg_u32 s4, 0
 ; SI-NEXT:    v_readfirstlane_b32 s4, v1
-; SI-NEXT:    v_writelane_b32 v4, s85, 29
+; SI-NEXT:    v_writelane_b32 v4, s31, 29
 ; SI-NEXT:    s_cbranch_scc0 .LBB25_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s38, s5, 24
@@ -10306,37 +10306,37 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32
 ; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 60, v0
 ; SI-NEXT:    v_mov_b32_e32 v1, s4
+; SI-NEXT:    v_readlane_b32 s30, v4, 28
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT:    v_readlane_b32 s85, v4, 29
-; SI-NEXT:    v_readlane_b32 s84, v4, 28
-; SI-NEXT:    v_readlane_b32 s83, v4, 27
-; SI-NEXT:    v_readlane_b32 s82, v4, 26
-; SI-NEXT:    v_readlane_b32 s81, v4, 25
-; SI-NEXT:    v_readlane_b32 s80, v4, 24
-; SI-NEXT:    v_readlane_b32 s71, v4, 23
-; SI-NEXT:    v_readlane_b32 s70, v4, 22
-; SI-NEXT:    v_readlane_b32 s69, v4, 21
-; SI-NEXT:    v_readlane_b32 s68, v4, 20
-; SI-NEXT:    v_readlane_b32 s67, v4, 19
-; SI-NEXT:    v_readlane_b32 s66, v4, 18
-; SI-NEXT:    v_readlane_b32 s65, v4, 17
-; SI-NEXT:    v_readlane_b32 s64, v4, 16
-; SI-NEXT:    v_readlane_b32 s55, v4, 15
-; SI-NEXT:    v_readlane_b32 s54, v4, 14
-; SI-NEXT:    v_readlane_b32 s53, v4, 13
-; SI-NEXT:    v_readlane_b32 s52, v4, 12
-; SI-NEXT:    v_readlane_b32 s51, v4, 11
-; SI-NEXT:    v_readlane_b32 s50, v4, 10
-; SI-NEXT:    v_readlane_b32 s49, v4, 9
-; SI-NEXT:    v_readlane_b32 s48, v4, 8
-; SI-NEXT:    v_readlane_b32 s39, v4, 7
-; SI-NEXT:    v_readlane_b32 s38, v4, 6
-; SI-NEXT:    v_readlane_b32 s37, v4, 5
-; SI-NEXT:    v_readlane_b32 s36, v4, 4
-; SI-NEXT:    v_readlane_b32 s35, v4, 3
-; SI-NEXT:    v_readlane_b32 s34, v4, 2
-; SI-NEXT:    v_readlane_b32 s31, v4, 1
-; SI-NEXT:    v_readlane_b32 s30, v4, 0
+; SI-NEXT:    v_readlane_b32 s31, v4, 29
+; SI-NEXT:    v_readlane_b32 s85, v4, 27
+; SI-NEXT:    v_readlane_b32 s84, v4, 26
+; SI-NEXT:    v_readlane_b32 s83, v4, 25
+; SI-NEXT:    v_readlane_b32 s82, v4, 24
+; SI-NEXT:    v_readlane_b32 s81, v4, 23
+; SI-NEXT:    v_readlane_b32 s80, v4, 22
+; SI-NEXT:    v_readlane_b32 s71, v4, 21
+; SI-NEXT:    v_readlane_b32 s70, v4, 20
+; SI-NEXT:    v_readlane_b32 s69, v4, 19
+; SI-NEXT:    v_readlane_b32 s68, v4, 18
+; SI-NEXT:    v_readlane_b32 s67, v4, 17
+; SI-NEXT:    v_readlane_b32 s66, v4, 16
+; SI-NEXT:    v_readlane_b32 s65, v4, 15
+; SI-NEXT:    v_readlane_b32 s64, v4, 14
+; SI-NEXT:    v_readlane_b32 s55, v4, 13
+; SI-NEXT:    v_readlane_b32 s54, v4, 12
+; SI-NEXT:    v_readlane_b32 s53, v4, 11
+; SI-NEXT:    v_readlane_b32 s52, v4, 10
+; SI-NEXT:    v_readlane_b32 s51, v4, 9
+; SI-NEXT:    v_readlane_b32 s50, v4, 8
+; SI-NEXT:    v_readlane_b32 s49, v4, 7
+; SI-NEXT:    v_readlane_b32 s48, v4, 6
+; SI-NEXT:    v_readlane_b32 s39, v4, 5
+; SI-NEXT:    v_readlane_b32 s38, v4, 4
+; SI-NEXT:    v_readlane_b32 s37, v4, 3
+; SI-NEXT:    v_readlane_b32 s36, v4, 2
+; SI-NEXT:    v_readlane_b32 s35, v4, 1
+; SI-NEXT:    v_readlane_b32 s34, v4, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -10399,30 +10399,30 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v4, s30, 0
-; VI-NEXT:    v_writelane_b32 v4, s31, 1
-; VI-NEXT:    v_writelane_b32 v4, s34, 2
-; VI-NEXT:    v_writelane_b32 v4, s35, 3
-; VI-NEXT:    v_writelane_b32 v4, s36, 4
-; VI-NEXT:    v_writelane_b32 v4, s37, 5
-; VI-NEXT:    v_writelane_b32 v4, s38, 6
-; VI-NEXT:    v_writelane_b32 v4, s39, 7
-; VI-NEXT:    v_writelane_b32 v4, s48, 8
-; VI-NEXT:    v_writelane_b32 v4, s49, 9
-; VI-NEXT:    v_writelane_b32 v4, s50, 10
-; VI-NEXT:    v_writelane_b32 v4, s51, 11
-; VI-NEXT:    v_writelane_b32 v4, s52, 12
-; VI-NEXT:    v_writelane_b32 v4, s53, 13
-; VI-NEXT:    v_writelane_b32 v4, s54, 14
-; VI-NEXT:    v_writelane_b32 v4, s55, 15
-; VI-NEXT:    v_writelane_b32 v4, s64, 16
-; VI-NEXT:    v_writelane_b32 v4, s65, 17
+; VI-NEXT:    v_writelane_b32 v4, s34, 0
+; VI-NEXT:    v_writelane_b32 v4, s35, 1
+; VI-NEXT:    v_writelane_b32 v4, s36, 2
+; VI-NEXT:    v_writelane_b32 v4, s37, 3
+; VI-NEXT:    v_writelane_b32 v4, s38, 4
+; VI-NEXT:    v_writelane_b32 v4, s39, 5
+; VI-NEXT:    v_writelane_b32 v4, s48, 6
+; VI-NEXT:    v_writelane_b32 v4, s49, 7
+; VI-NEXT:    v_writelane_b32 v4, s50, 8
+; VI-NEXT:    v_writelane_b32 v4, s51, 9
+; VI-NEXT:    v_writelane_b32 v4, s52, 10
+; VI-NEXT:    v_writelane_b32 v4, s53, 11
+; VI-NEXT:    v_writelane_b32 v4, s54, 12
+; VI-NEXT:    v_writelane_b32 v4, s55, 13
+; VI-NEXT:    v_writelane_b32 v4, s64, 14
+; VI-NEXT:    v_writelane_b32 v4, s65, 15
+; VI-NEXT:    v_writelane_b32 v4, s66, 16
+; VI-NEXT:    v_writelane_b32 v4, s67, 17
 ; VI-NEXT:    v_readfirstlane_b32 s4, v3
-; VI-NEXT:    v_writelane_b32 v4, s66, 18
+; VI-NEXT:    v_writelane_b32 v4, s30, 18
 ; VI-NEXT:    v_readfirstlane_b32 s5, v2
 ; VI-NEXT:    s_cmp_lg_u32 s4, 0
 ; VI-NEXT:    v_readfirstlane_b32 s4, v1
-; VI-NEXT:    v_writelane_b32 v4, s67, 19
+; VI-NEXT:    v_writelane_b32 v4, s31, 19
 ; VI-NEXT:    s_cbranch_scc0 .LBB25_4
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    s_lshr_b32 s56, s5, 24
@@ -10667,27 +10667,27 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; VI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 60, v0
+; VI-NEXT:    v_readlane_b32 s30, v4, 18
 ; VI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; VI-NEXT:    v_readlane_b32 s67, v4, 19
-; VI-NEXT:    v_readlane_b32 s66, v4, 18
-; VI-NEXT:    v_readlane_b32 s65, v4, 17
-; VI-NEXT:    v_readlane_b32 s64, v4, 16
-; VI-NEXT:    v_readlane_b32 s55, v4, 15
-; VI-NEXT:    v_readlane_b32 s54, v4, 14
-; VI-NEXT:    v_readlane_b32 s53, v4, 13
-; VI-NEXT:    v_readlane_b32 s52, v4, 12
-; VI-NEXT:    v_readlane_b32 s51, v4, 11
-; VI-NEXT:    v_readlane_b32 s50, v4, 10
-; VI-NEXT:    v_readlane_b32 s49, v4, 9
-; VI-NEXT:    v_readlane_b32 s48, v4, 8
-; VI-NEXT:    v_readlane_b32 s39, v4, 7
-; VI-NEXT:    v_readlane_b32 s38, v4, 6
-; VI-NEXT:    v_readlane_b32 s37, v4, 5
-; VI-NEXT:    v_readlane_b32 s36, v4, 4
-; VI-NEXT:    v_readlane_b32 s35, v4, 3
-; VI-NEXT:    v_readlane_b32 s34, v4, 2
-; VI-NEXT:    v_readlane_b32 s31, v4, 1
-; VI-NEXT:    v_readlane_b32 s30, v4, 0
+; VI-NEXT:    v_readlane_b32 s31, v4, 19
+; VI-NEXT:    v_readlane_b32 s67, v4, 17
+; VI-NEXT:    v_readlane_b32 s66, v4, 16
+; VI-NEXT:    v_readlane_b32 s65, v4, 15
+; VI-NEXT:    v_readlane_b32 s64, v4, 14
+; VI-NEXT:    v_readlane_b32 s55, v4, 13
+; VI-NEXT:    v_readlane_b32 s54, v4, 12
+; VI-NEXT:    v_readlane_b32 s53, v4, 11
+; VI-NEXT:    v_readlane_b32 s52, v4, 10
+; VI-NEXT:    v_readlane_b32 s51, v4, 9
+; VI-NEXT:    v_readlane_b32 s50, v4, 8
+; VI-NEXT:    v_readlane_b32 s49, v4, 7
+; VI-NEXT:    v_readlane_b32 s48, v4, 6
+; VI-NEXT:    v_readlane_b32 s39, v4, 5
+; VI-NEXT:    v_readlane_b32 s38, v4, 4
+; VI-NEXT:    v_readlane_b32 s37, v4, 3
+; VI-NEXT:    v_readlane_b32 s36, v4, 2
+; VI-NEXT:    v_readlane_b32 s35, v4, 1
+; VI-NEXT:    v_readlane_b32 s34, v4, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -10750,26 +10750,26 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v4, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v4, s31, 1
-; GFX9-NEXT:    v_writelane_b32 v4, s34, 2
-; GFX9-NEXT:    v_writelane_b32 v4, s35, 3
-; GFX9-NEXT:    v_writelane_b32 v4, s36, 4
-; GFX9-NEXT:    v_writelane_b32 v4, s37, 5
-; GFX9-NEXT:    v_writelane_b32 v4, s38, 6
-; GFX9-NEXT:    v_writelane_b32 v4, s39, 7
-; GFX9-NEXT:    v_writelane_b32 v4, s48, 8
-; GFX9-NEXT:    v_writelane_b32 v4, s49, 9
-; GFX9-NEXT:    v_writelane_b32 v4, s50, 10
-; GFX9-NEXT:    v_writelane_b32 v4, s51, 11
-; GFX9-NEXT:    v_writelane_b32 v4, s52, 12
-; GFX9-NEXT:    v_writelane_b32 v4, s53, 13
+; GFX9-NEXT:    v_writelane_b32 v4, s34, 0
+; GFX9-NEXT:    v_writelane_b32 v4, s35, 1
+; GFX9-NEXT:    v_writelane_b32 v4, s36, 2
+; GFX9-NEXT:    v_writelane_b32 v4, s37, 3
+; GFX9-NEXT:    v_writelane_b32 v4, s38, 4
+; GFX9-NEXT:    v_writelane_b32 v4, s39, 5
+; GFX9-NEXT:    v_writelane_b32 v4, s48, 6
+; GFX9-NEXT:    v_writelane_b32 v4, s49, 7
+; GFX9-NEXT:    v_writelane_b32 v4, s50, 8
+; GFX9-NEXT:    v_writelane_b32 v4, s51, 9
+; GFX9-NEXT:    v_writelane_b32 v4, s52, 10
+; GFX9-NEXT:    v_writelane_b32 v4, s53, 11
+; GFX9-NEXT:    v_writelane_b32 v4, s54, 12
+; GFX9-NEXT:    v_writelane_b32 v4, s55, 13
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v3
-; GFX9-NEXT:    v_writelane_b32 v4, s54, 14
+; GFX9-NEXT:    v_writelane_b32 v4, s30, 14
 ; GFX9-NEXT:    v_readfirstlane_b32 s5, v2
 ; GFX9-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
-; GFX9-NEXT:    v_writelane_b32 v4, s55, 15
+; GFX9-NEXT:    v_writelane_b32 v4, s31, 15
 ; GFX9-NEXT:    s_cbranch_scc0 .LBB25_4
 ; GFX9-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX9-NEXT:    s_lshr_b32 s56, s5, 24
@@ -10999,23 +10999,23 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32
 ; GFX9-NEXT:    v_perm_b32 v2, s57, v3, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX9-NEXT:    v_readlane_b32 s30, v4, 14
 ; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:60
-; GFX9-NEXT:    v_readlane_b32 s55, v4, 15
-; GFX9-NEXT:    v_readlane_b32 s54, v4, 14
-; GFX9-NEXT:    v_readlane_b32 s53, v4, 13
-; GFX9-NEXT:    v_readlane_b32 s52, v4, 12
-; GFX9-NEXT:    v_readlane_b32 s51, v4, 11
-; GFX9-NEXT:    v_readlane_b32 s50, v4, 10
-; GFX9-NEXT:    v_readlane_b32 s49, v4, 9
-; GFX9-NEXT:    v_readlane_b32 s48, v4, 8
-; GFX9-NEXT:    v_readlane_b32 s39, v4, 7
-; GFX9-NEXT:    v_readlane_b32 s38, v4, 6
-; GFX9-NEXT:    v_readlane_b32 s37, v4, 5
-; GFX9-NEXT:    v_readlane_b32 s36, v4, 4
-; GFX9-NEXT:    v_readlane_b32 s35, v4, 3
-; GFX9-NEXT:    v_readlane_b32 s34, v4, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v4, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v4, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v4, 15
+; GFX9-NEXT:    v_readlane_b32 s55, v4, 13
+; GFX9-NEXT:    v_readlane_b32 s54, v4, 12
+; GFX9-NEXT:    v_readlane_b32 s53, v4, 11
+; GFX9-NEXT:    v_readlane_b32 s52, v4, 10
+; GFX9-NEXT:    v_readlane_b32 s51, v4, 9
+; GFX9-NEXT:    v_readlane_b32 s50, v4, 8
+; GFX9-NEXT:    v_readlane_b32 s49, v4, 7
+; GFX9-NEXT:    v_readlane_b32 s48, v4, 6
+; GFX9-NEXT:    v_readlane_b32 s39, v4, 5
+; GFX9-NEXT:    v_readlane_b32 s38, v4, 4
+; GFX9-NEXT:    v_readlane_b32 s37, v4, 3
+; GFX9-NEXT:    v_readlane_b32 s36, v4, 2
+; GFX9-NEXT:    v_readlane_b32 s35, v4, 1
+; GFX9-NEXT:    v_readlane_b32 s34, v4, 0
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -11078,17 +11078,17 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32
 ; GFX11-NEXT:    s_xor_saveexec_b32 s4, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v23, s32 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s4
-; GFX11-NEXT:    v_writelane_b32 v23, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v23, s34, 0
 ; GFX11-NEXT:    s_cmp_lg_u32 s28, 0
 ; GFX11-NEXT:    s_mov_b32 vcc_lo, 0
-; GFX11-NEXT:    v_writelane_b32 v23, s31, 1
-; GFX11-NEXT:    v_writelane_b32 v23, s34, 2
-; GFX11-NEXT:    v_writelane_b32 v23, s35, 3
-; GFX11-NEXT:    v_writelane_b32 v23, s36, 4
-; GFX11-NEXT:    v_writelane_b32 v23, s37, 5
-; GFX11-NEXT:    v_writelane_b32 v23, s38, 6
-; GFX11-NEXT:    v_writelane_b32 v23, s39, 7
-; GFX11-NEXT:    v_writelane_b32 v23, s48, 8
+; GFX11-NEXT:    v_writelane_b32 v23, s35, 1
+; GFX11-NEXT:    v_writelane_b32 v23, s36, 2
+; GFX11-NEXT:    v_writelane_b32 v23, s37, 3
+; GFX11-NEXT:    v_writelane_b32 v23, s38, 4
+; GFX11-NEXT:    v_writelane_b32 v23, s39, 5
+; GFX11-NEXT:    v_writelane_b32 v23, s48, 6
+; GFX11-NEXT:    v_writelane_b32 v23, s30, 7
+; GFX11-NEXT:    v_writelane_b32 v23, s31, 8
 ; GFX11-NEXT:    s_cbranch_scc0 .LBB25_4
 ; GFX11-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX11-NEXT:    s_lshr_b32 s42, s27, 24
@@ -11210,7 +11210,7 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32
 ; GFX11-NEXT:    v_mov_b32_e32 v12, 0xc0c0004
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_perm_b32 v5, s34, s28, v12
-; GFX11-NEXT:    v_readlane_b32 s34, v23, 2
+; GFX11-NEXT:    v_readlane_b32 s34, v23, 0
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; GFX11-NEXT:    v_perm_b32 v2, s39, s40, v12
 ; GFX11-NEXT:    v_perm_b32 v4, s37, s36, v12
@@ -11274,19 +11274,19 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32
 ; GFX11-NEXT:    v_or_b32_e32 v14, v19, v17
 ; GFX11-NEXT:    v_or_b32_e32 v15, v21, v18
 ; GFX11-NEXT:    v_or_b32_e32 v16, v22, v20
+; GFX11-NEXT:    v_readlane_b32 s30, v23, 7
 ; GFX11-NEXT:    s_clause 0x3
 ; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off
 ; GFX11-NEXT:    scratch_store_b128 v0, v[5:8], off offset:16
 ; GFX11-NEXT:    scratch_store_b128 v0, v[9:12], off offset:32
 ; GFX11-NEXT:    scratch_store_b128 v0, v[13:16], off offset:48
-; GFX11-NEXT:    v_readlane_b32 s48, v23, 8
-; GFX11-NEXT:    v_readlane_b32 s39, v23, 7
-; GFX11-NEXT:    v_readlane_b32 s38, v23, 6
-; GFX11-NEXT:    v_readlane_b32 s37, v23, 5
-; GFX11-NEXT:    v_readlane_b32 s36, v23, 4
-; GFX11-NEXT:    v_readlane_b32 s35, v23, 3
-; GFX11-NEXT:    v_readlane_b32 s31, v23, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v23, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v23, 8
+; GFX11-NEXT:    v_readlane_b32 s48, v23, 6
+; GFX11-NEXT:    v_readlane_b32 s39, v23, 5
+; GFX11-NEXT:    v_readlane_b32 s38, v23, 4
+; GFX11-NEXT:    v_readlane_b32 s37, v23, 3
+; GFX11-NEXT:    v_readlane_b32 s36, v23, 2
+; GFX11-NEXT:    v_readlane_b32 s35, v23, 1
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_load_b32 v23, off, s32 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
@@ -11359,7 +11359,7 @@ end:
   ret <64 x i8> %phi
 }
 
-define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) {
+define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v64i8_to_v16i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13635,7 +13635,7 @@ end:
   ret <16 x i32> %phi
 }
 
-define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 inreg %b) {
+define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v64i8_to_v16i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15272,7 +15272,7 @@ end:
   ret <16 x i32> %phi
 }
 
-define <8 x i64> @bitcast_v16f32_to_v8i64(<16 x float> %a, i32 %b) {
+define <8 x i64> @bitcast_v16f32_to_v8i64(<16 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16f32_to_v8i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15398,7 +15398,7 @@ end:
   ret <8 x i64> %phi
 }
 
-define inreg <8 x i64> @bitcast_v16f32_to_v8i64_scalar(<16 x float> inreg %a, i32 inreg %b) {
+define inreg <8 x i64> @bitcast_v16f32_to_v8i64_scalar(<16 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16f32_to_v8i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15453,8 +15453,8 @@ define inreg <8 x i64> @bitcast_v16f32_to_v8i64_scalar(<16 x float> inreg %a, i3
 ; SI-NEXT:    v_mov_b32_e32 v14, s30
 ; SI-NEXT:    v_mov_b32_e32 v15, s31
 ; SI-NEXT:  .LBB29_5: ; %end
-; SI-NEXT:    v_readlane_b32 s31, v16, 1
 ; SI-NEXT:    v_readlane_b32 s30, v16, 0
+; SI-NEXT:    v_readlane_b32 s31, v16, 1
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -15514,8 +15514,8 @@ define inreg <8 x i64> @bitcast_v16f32_to_v8i64_scalar(<16 x float> inreg %a, i3
 ; VI-NEXT:    v_mov_b32_e32 v14, s30
 ; VI-NEXT:    v_mov_b32_e32 v15, s31
 ; VI-NEXT:  .LBB29_5: ; %end
-; VI-NEXT:    v_readlane_b32 s31, v16, 1
 ; VI-NEXT:    v_readlane_b32 s30, v16, 0
+; VI-NEXT:    v_readlane_b32 s31, v16, 1
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -15575,8 +15575,8 @@ define inreg <8 x i64> @bitcast_v16f32_to_v8i64_scalar(<16 x float> inreg %a, i3
 ; GFX9-NEXT:    v_mov_b32_e32 v14, s30
 ; GFX9-NEXT:    v_mov_b32_e32 v15, s31
 ; GFX9-NEXT:  .LBB29_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v16, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -15643,7 +15643,7 @@ end:
   ret <8 x i64> %phi
 }
 
-define <16 x float> @bitcast_v8i64_to_v16f32(<8 x i64> %a, i32 %b) {
+define <16 x float> @bitcast_v8i64_to_v16f32(<8 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8i64_to_v16f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15781,7 +15781,7 @@ end:
   ret <16 x float> %phi
 }
 
-define inreg <16 x float> @bitcast_v8i64_to_v16f32_scalar(<8 x i64> inreg %a, i32 inreg %b) {
+define inreg <16 x float> @bitcast_v8i64_to_v16f32_scalar(<8 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8i64_to_v16f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15982,7 +15982,7 @@ end:
   ret <16 x float> %phi
 }
 
-define <8 x double> @bitcast_v16f32_to_v8f64(<16 x float> %a, i32 %b) {
+define <8 x double> @bitcast_v16f32_to_v8f64(<16 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16f32_to_v8f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16108,7 +16108,7 @@ end:
   ret <8 x double> %phi
 }
 
-define inreg <8 x double> @bitcast_v16f32_to_v8f64_scalar(<16 x float> inreg %a, i32 inreg %b) {
+define inreg <8 x double> @bitcast_v16f32_to_v8f64_scalar(<16 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16f32_to_v8f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16163,8 +16163,8 @@ define inreg <8 x double> @bitcast_v16f32_to_v8f64_scalar(<16 x float> inreg %a,
 ; SI-NEXT:    v_mov_b32_e32 v14, s30
 ; SI-NEXT:    v_mov_b32_e32 v15, s31
 ; SI-NEXT:  .LBB33_5: ; %end
-; SI-NEXT:    v_readlane_b32 s31, v16, 1
 ; SI-NEXT:    v_readlane_b32 s30, v16, 0
+; SI-NEXT:    v_readlane_b32 s31, v16, 1
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -16224,8 +16224,8 @@ define inreg <8 x double> @bitcast_v16f32_to_v8f64_scalar(<16 x float> inreg %a,
 ; VI-NEXT:    v_mov_b32_e32 v14, s30
 ; VI-NEXT:    v_mov_b32_e32 v15, s31
 ; VI-NEXT:  .LBB33_5: ; %end
-; VI-NEXT:    v_readlane_b32 s31, v16, 1
 ; VI-NEXT:    v_readlane_b32 s30, v16, 0
+; VI-NEXT:    v_readlane_b32 s31, v16, 1
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -16285,8 +16285,8 @@ define inreg <8 x double> @bitcast_v16f32_to_v8f64_scalar(<16 x float> inreg %a,
 ; GFX9-NEXT:    v_mov_b32_e32 v14, s30
 ; GFX9-NEXT:    v_mov_b32_e32 v15, s31
 ; GFX9-NEXT:  .LBB33_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v16, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -16353,7 +16353,7 @@ end:
   ret <8 x double> %phi
 }
 
-define <16 x float> @bitcast_v8f64_to_v16f32(<8 x double> %a, i32 %b) {
+define <16 x float> @bitcast_v8f64_to_v16f32(<8 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8f64_to_v16f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16455,7 +16455,7 @@ end:
   ret <16 x float> %phi
 }
 
-define inreg <16 x float> @bitcast_v8f64_to_v16f32_scalar(<8 x double> inreg %a, i32 inreg %b) {
+define inreg <16 x float> @bitcast_v8f64_to_v16f32_scalar(<8 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8f64_to_v16f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16502,8 +16502,8 @@ define inreg <16 x float> @bitcast_v8f64_to_v16f32_scalar(<8 x double> inreg %a,
 ; SI-NEXT:    v_mov_b32_e32 v14, s30
 ; SI-NEXT:    v_mov_b32_e32 v15, s31
 ; SI-NEXT:  .LBB35_5: ; %end
-; SI-NEXT:    v_readlane_b32 s31, v16, 1
 ; SI-NEXT:    v_readlane_b32 s30, v16, 0
+; SI-NEXT:    v_readlane_b32 s31, v16, 1
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -16555,8 +16555,8 @@ define inreg <16 x float> @bitcast_v8f64_to_v16f32_scalar(<8 x double> inreg %a,
 ; VI-NEXT:    v_mov_b32_e32 v14, s30
 ; VI-NEXT:    v_mov_b32_e32 v15, s31
 ; VI-NEXT:  .LBB35_5: ; %end
-; VI-NEXT:    v_readlane_b32 s31, v16, 1
 ; VI-NEXT:    v_readlane_b32 s30, v16, 0
+; VI-NEXT:    v_readlane_b32 s31, v16, 1
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -16608,8 +16608,8 @@ define inreg <16 x float> @bitcast_v8f64_to_v16f32_scalar(<8 x double> inreg %a,
 ; GFX9-NEXT:    v_mov_b32_e32 v14, s30
 ; GFX9-NEXT:    v_mov_b32_e32 v15, s31
 ; GFX9-NEXT:  .LBB35_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v16, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -16668,7 +16668,7 @@ end:
   ret <16 x float> %phi
 }
 
-define <32 x i16> @bitcast_v16f32_to_v32i16(<16 x float> %a, i32 %b) {
+define <32 x i16> @bitcast_v16f32_to_v32i16(<16 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16f32_to_v32i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16893,7 +16893,7 @@ end:
   ret <32 x i16> %phi
 }
 
-define inreg <32 x i16> @bitcast_v16f32_to_v32i16_scalar(<16 x float> inreg %a, i32 inreg %b) {
+define inreg <32 x i16> @bitcast_v16f32_to_v32i16_scalar(<16 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16f32_to_v32i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17109,8 +17109,8 @@ define inreg <32 x i16> @bitcast_v16f32_to_v32i16_scalar(<16 x float> inreg %a,
 ; VI-NEXT:    v_mov_b32_e32 v14, s30
 ; VI-NEXT:    v_mov_b32_e32 v15, s31
 ; VI-NEXT:  .LBB37_5: ; %end
-; VI-NEXT:    v_readlane_b32 s31, v16, 1
 ; VI-NEXT:    v_readlane_b32 s30, v16, 0
+; VI-NEXT:    v_readlane_b32 s31, v16, 1
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -17170,8 +17170,8 @@ define inreg <32 x i16> @bitcast_v16f32_to_v32i16_scalar(<16 x float> inreg %a,
 ; GFX9-NEXT:    v_mov_b32_e32 v14, s30
 ; GFX9-NEXT:    v_mov_b32_e32 v15, s31
 ; GFX9-NEXT:  .LBB37_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v16, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -17238,7 +17238,7 @@ end:
   ret <32 x i16> %phi
 }
 
-define <16 x float> @bitcast_v32i16_to_v16f32(<32 x i16> %a, i32 %b) {
+define <16 x float> @bitcast_v32i16_to_v16f32(<32 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v32i16_to_v16f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17574,7 +17574,7 @@ end:
   ret <16 x float> %phi
 }
 
-define inreg <16 x float> @bitcast_v32i16_to_v16f32_scalar(<32 x i16> inreg %a, i32 inreg %b) {
+define inreg <16 x float> @bitcast_v32i16_to_v16f32_scalar(<32 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v32i16_to_v16f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17941,8 +17941,8 @@ define inreg <16 x float> @bitcast_v32i16_to_v16f32_scalar(<32 x i16> inreg %a,
 ; GFX9-NEXT:    v_mov_b32_e32 v14, s30
 ; GFX9-NEXT:    v_mov_b32_e32 v15, s31
 ; GFX9-NEXT:  .LBB39_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v16, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -18009,7 +18009,7 @@ end:
   ret <16 x float> %phi
 }
 
-define <32 x half> @bitcast_v16f32_to_v32f16(<16 x float> %a, i32 %b) {
+define <32 x half> @bitcast_v16f32_to_v32f16(<16 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16f32_to_v32f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18234,7 +18234,7 @@ end:
   ret <32 x half> %phi
 }
 
-define inreg <32 x half> @bitcast_v16f32_to_v32f16_scalar(<16 x float> inreg %a, i32 inreg %b) {
+define inreg <32 x half> @bitcast_v16f32_to_v32f16_scalar(<16 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16f32_to_v32f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18450,8 +18450,8 @@ define inreg <32 x half> @bitcast_v16f32_to_v32f16_scalar(<16 x float> inreg %a,
 ; VI-NEXT:    v_mov_b32_e32 v14, s30
 ; VI-NEXT:    v_mov_b32_e32 v15, s31
 ; VI-NEXT:  .LBB41_5: ; %end
-; VI-NEXT:    v_readlane_b32 s31, v16, 1
 ; VI-NEXT:    v_readlane_b32 s30, v16, 0
+; VI-NEXT:    v_readlane_b32 s31, v16, 1
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -18511,8 +18511,8 @@ define inreg <32 x half> @bitcast_v16f32_to_v32f16_scalar(<16 x float> inreg %a,
 ; GFX9-NEXT:    v_mov_b32_e32 v14, s30
 ; GFX9-NEXT:    v_mov_b32_e32 v15, s31
 ; GFX9-NEXT:  .LBB41_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v16, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -18579,7 +18579,7 @@ end:
   ret <32 x half> %phi
 }
 
-define <16 x float> @bitcast_v32f16_to_v16f32(<32 x half> %a, i32 %b) {
+define <16 x float> @bitcast_v32f16_to_v16f32(<32 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v32f16_to_v16f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18979,7 +18979,7 @@ end:
   ret <16 x float> %phi
 }
 
-define inreg <16 x float> @bitcast_v32f16_to_v16f32_scalar(<32 x half> inreg %a, i32 inreg %b) {
+define inreg <16 x float> @bitcast_v32f16_to_v16f32_scalar(<32 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v32f16_to_v16f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19349,8 +19349,8 @@ define inreg <16 x float> @bitcast_v32f16_to_v16f32_scalar(<32 x half> inreg %a,
 ; VI-NEXT:    v_mov_b32_e32 v14, s30
 ; VI-NEXT:    v_mov_b32_e32 v15, s31
 ; VI-NEXT:  .LBB43_5: ; %end
-; VI-NEXT:    v_readlane_b32 s31, v17, 1
 ; VI-NEXT:    v_readlane_b32 s30, v17, 0
+; VI-NEXT:    v_readlane_b32 s31, v17, 1
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -19411,8 +19411,8 @@ define inreg <16 x float> @bitcast_v32f16_to_v16f32_scalar(<32 x half> inreg %a,
 ; GFX9-NEXT:    v_mov_b32_e32 v14, s30
 ; GFX9-NEXT:    v_mov_b32_e32 v15, s31
 ; GFX9-NEXT:  .LBB43_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v16, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -19479,7 +19479,7 @@ end:
   ret <16 x float> %phi
 }
 
-define <32 x bfloat> @bitcast_v16f32_to_v32bf16(<16 x float> %a, i32 %b) {
+define <32 x bfloat> @bitcast_v16f32_to_v32bf16(<16 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16f32_to_v32bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19784,7 +19784,7 @@ end:
   ret <32 x bfloat> %phi
 }
 
-define inreg <32 x bfloat> @bitcast_v16f32_to_v32bf16_scalar(<16 x float> inreg %a, i32 inreg %b) {
+define inreg <32 x bfloat> @bitcast_v16f32_to_v32bf16_scalar(<16 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16f32_to_v32bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20064,8 +20064,8 @@ define inreg <32 x bfloat> @bitcast_v16f32_to_v32bf16_scalar(<16 x float> inreg
 ; VI-NEXT:    v_mov_b32_e32 v14, s30
 ; VI-NEXT:    v_mov_b32_e32 v15, s31
 ; VI-NEXT:  .LBB45_5: ; %end
-; VI-NEXT:    v_readlane_b32 s31, v16, 1
 ; VI-NEXT:    v_readlane_b32 s30, v16, 0
+; VI-NEXT:    v_readlane_b32 s31, v16, 1
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -20125,8 +20125,8 @@ define inreg <32 x bfloat> @bitcast_v16f32_to_v32bf16_scalar(<16 x float> inreg
 ; GFX9-NEXT:    v_mov_b32_e32 v14, s30
 ; GFX9-NEXT:    v_mov_b32_e32 v15, s31
 ; GFX9-NEXT:  .LBB45_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v16, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -20193,7 +20193,7 @@ end:
   ret <32 x bfloat> %phi
 }
 
-define <16 x float> @bitcast_v32bf16_to_v16f32(<32 x bfloat> %a, i32 %b) {
+define <16 x float> @bitcast_v32bf16_to_v16f32(<32 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v32bf16_to_v16f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -21599,7 +21599,7 @@ end:
   ret <16 x float> %phi
 }
 
-define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v32bf16_to_v16f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22179,8 +22179,8 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg
 ; VI-NEXT:    v_mov_b32_e32 v14, s30
 ; VI-NEXT:    v_mov_b32_e32 v15, s31
 ; VI-NEXT:  .LBB47_5: ; %end
-; VI-NEXT:    v_readlane_b32 s31, v20, 1
 ; VI-NEXT:    v_readlane_b32 s30, v20, 0
+; VI-NEXT:    v_readlane_b32 s31, v20, 1
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -22530,8 +22530,8 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg
 ; GFX9-NEXT:    v_mov_b32_e32 v14, s30
 ; GFX9-NEXT:    v_mov_b32_e32 v15, s31
 ; GFX9-NEXT:  .LBB47_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s31, v20, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v20, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v20, 1
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -23245,7 +23245,7 @@ end:
   ret <16 x float> %phi
 }
 
-define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) {
+define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16f32_to_v64i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -24696,7 +24696,7 @@ end:
   ret <64 x i8> %phi
 }
 
-define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i32 inreg %b) {
+define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16f32_to_v64i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -24704,40 +24704,40 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3
 ; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v40, s30, 0
-; SI-NEXT:    v_writelane_b32 v40, s31, 1
-; SI-NEXT:    v_writelane_b32 v40, s34, 2
-; SI-NEXT:    v_writelane_b32 v40, s35, 3
-; SI-NEXT:    v_writelane_b32 v40, s36, 4
-; SI-NEXT:    v_writelane_b32 v40, s37, 5
-; SI-NEXT:    v_writelane_b32 v40, s38, 6
-; SI-NEXT:    v_writelane_b32 v40, s39, 7
-; SI-NEXT:    v_writelane_b32 v40, s48, 8
-; SI-NEXT:    v_writelane_b32 v40, s49, 9
-; SI-NEXT:    v_writelane_b32 v40, s50, 10
-; SI-NEXT:    v_writelane_b32 v40, s51, 11
-; SI-NEXT:    v_writelane_b32 v40, s52, 12
-; SI-NEXT:    v_writelane_b32 v40, s53, 13
-; SI-NEXT:    v_writelane_b32 v40, s54, 14
-; SI-NEXT:    v_writelane_b32 v40, s55, 15
-; SI-NEXT:    v_writelane_b32 v40, s64, 16
-; SI-NEXT:    v_writelane_b32 v40, s65, 17
-; SI-NEXT:    v_writelane_b32 v40, s66, 18
-; SI-NEXT:    v_writelane_b32 v40, s67, 19
-; SI-NEXT:    v_writelane_b32 v40, s68, 20
-; SI-NEXT:    v_writelane_b32 v40, s69, 21
-; SI-NEXT:    v_writelane_b32 v40, s70, 22
-; SI-NEXT:    v_writelane_b32 v40, s71, 23
-; SI-NEXT:    v_writelane_b32 v40, s80, 24
-; SI-NEXT:    v_writelane_b32 v40, s81, 25
-; SI-NEXT:    v_writelane_b32 v40, s82, 26
-; SI-NEXT:    v_writelane_b32 v40, s83, 27
+; SI-NEXT:    v_writelane_b32 v40, s34, 0
+; SI-NEXT:    v_writelane_b32 v40, s35, 1
+; SI-NEXT:    v_writelane_b32 v40, s36, 2
+; SI-NEXT:    v_writelane_b32 v40, s37, 3
+; SI-NEXT:    v_writelane_b32 v40, s38, 4
+; SI-NEXT:    v_writelane_b32 v40, s39, 5
+; SI-NEXT:    v_writelane_b32 v40, s48, 6
+; SI-NEXT:    v_writelane_b32 v40, s49, 7
+; SI-NEXT:    v_writelane_b32 v40, s50, 8
+; SI-NEXT:    v_writelane_b32 v40, s51, 9
+; SI-NEXT:    v_writelane_b32 v40, s52, 10
+; SI-NEXT:    v_writelane_b32 v40, s53, 11
+; SI-NEXT:    v_writelane_b32 v40, s54, 12
+; SI-NEXT:    v_writelane_b32 v40, s55, 13
+; SI-NEXT:    v_writelane_b32 v40, s64, 14
+; SI-NEXT:    v_writelane_b32 v40, s65, 15
+; SI-NEXT:    v_writelane_b32 v40, s66, 16
+; SI-NEXT:    v_writelane_b32 v40, s67, 17
+; SI-NEXT:    v_writelane_b32 v40, s68, 18
+; SI-NEXT:    v_writelane_b32 v40, s69, 19
+; SI-NEXT:    v_writelane_b32 v40, s70, 20
+; SI-NEXT:    v_writelane_b32 v40, s71, 21
+; SI-NEXT:    v_writelane_b32 v40, s80, 22
+; SI-NEXT:    v_writelane_b32 v40, s81, 23
+; SI-NEXT:    v_writelane_b32 v40, s82, 24
+; SI-NEXT:    v_writelane_b32 v40, s83, 25
+; SI-NEXT:    v_writelane_b32 v40, s84, 26
+; SI-NEXT:    v_writelane_b32 v40, s85, 27
 ; SI-NEXT:    v_readfirstlane_b32 s4, v3
-; SI-NEXT:    v_writelane_b32 v40, s84, 28
+; SI-NEXT:    v_writelane_b32 v40, s30, 28
 ; SI-NEXT:    v_readfirstlane_b32 s37, v2
 ; SI-NEXT:    s_cmp_lg_u32 s4, 0
 ; SI-NEXT:    v_readfirstlane_b32 s36, v1
-; SI-NEXT:    v_writelane_b32 v40, s85, 29
+; SI-NEXT:    v_writelane_b32 v40, s31, 29
 ; SI-NEXT:    s_cbranch_scc0 .LBB49_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s82, s37, 24
@@ -25152,37 +25152,37 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3
 ; SI-NEXT:    v_or_b32_e32 v2, v3, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 60, v0
+; SI-NEXT:    v_readlane_b32 s30, v40, 28
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT:    v_readlane_b32 s85, v40, 29
-; SI-NEXT:    v_readlane_b32 s84, v40, 28
-; SI-NEXT:    v_readlane_b32 s83, v40, 27
-; SI-NEXT:    v_readlane_b32 s82, v40, 26
-; SI-NEXT:    v_readlane_b32 s81, v40, 25
-; SI-NEXT:    v_readlane_b32 s80, v40, 24
-; SI-NEXT:    v_readlane_b32 s71, v40, 23
-; SI-NEXT:    v_readlane_b32 s70, v40, 22
-; SI-NEXT:    v_readlane_b32 s69, v40, 21
-; SI-NEXT:    v_readlane_b32 s68, v40, 20
-; SI-NEXT:    v_readlane_b32 s67, v40, 19
-; SI-NEXT:    v_readlane_b32 s66, v40, 18
-; SI-NEXT:    v_readlane_b32 s65, v40, 17
-; SI-NEXT:    v_readlane_b32 s64, v40, 16
-; SI-NEXT:    v_readlane_b32 s55, v40, 15
-; SI-NEXT:    v_readlane_b32 s54, v40, 14
-; SI-NEXT:    v_readlane_b32 s53, v40, 13
-; SI-NEXT:    v_readlane_b32 s52, v40, 12
-; SI-NEXT:    v_readlane_b32 s51, v40, 11
-; SI-NEXT:    v_readlane_b32 s50, v40, 10
-; SI-NEXT:    v_readlane_b32 s49, v40, 9
-; SI-NEXT:    v_readlane_b32 s48, v40, 8
-; SI-NEXT:    v_readlane_b32 s39, v40, 7
-; SI-NEXT:    v_readlane_b32 s38, v40, 6
-; SI-NEXT:    v_readlane_b32 s37, v40, 5
-; SI-NEXT:    v_readlane_b32 s36, v40, 4
-; SI-NEXT:    v_readlane_b32 s35, v40, 3
-; SI-NEXT:    v_readlane_b32 s34, v40, 2
-; SI-NEXT:    v_readlane_b32 s31, v40, 1
-; SI-NEXT:    v_readlane_b32 s30, v40, 0
+; SI-NEXT:    v_readlane_b32 s31, v40, 29
+; SI-NEXT:    v_readlane_b32 s85, v40, 27
+; SI-NEXT:    v_readlane_b32 s84, v40, 26
+; SI-NEXT:    v_readlane_b32 s83, v40, 25
+; SI-NEXT:    v_readlane_b32 s82, v40, 24
+; SI-NEXT:    v_readlane_b32 s81, v40, 23
+; SI-NEXT:    v_readlane_b32 s80, v40, 22
+; SI-NEXT:    v_readlane_b32 s71, v40, 21
+; SI-NEXT:    v_readlane_b32 s70, v40, 20
+; SI-NEXT:    v_readlane_b32 s69, v40, 19
+; SI-NEXT:    v_readlane_b32 s68, v40, 18
+; SI-NEXT:    v_readlane_b32 s67, v40, 17
+; SI-NEXT:    v_readlane_b32 s66, v40, 16
+; SI-NEXT:    v_readlane_b32 s65, v40, 15
+; SI-NEXT:    v_readlane_b32 s64, v40, 14
+; SI-NEXT:    v_readlane_b32 s55, v40, 13
+; SI-NEXT:    v_readlane_b32 s54, v40, 12
+; SI-NEXT:    v_readlane_b32 s53, v40, 11
+; SI-NEXT:    v_readlane_b32 s52, v40, 10
+; SI-NEXT:    v_readlane_b32 s51, v40, 9
+; SI-NEXT:    v_readlane_b32 s50, v40, 8
+; SI-NEXT:    v_readlane_b32 s49, v40, 7
+; SI-NEXT:    v_readlane_b32 s48, v40, 6
+; SI-NEXT:    v_readlane_b32 s39, v40, 5
+; SI-NEXT:    v_readlane_b32 s38, v40, 4
+; SI-NEXT:    v_readlane_b32 s37, v40, 3
+; SI-NEXT:    v_readlane_b32 s36, v40, 2
+; SI-NEXT:    v_readlane_b32 s35, v40, 1
+; SI-NEXT:    v_readlane_b32 s34, v40, 0
 ; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -25195,27 +25195,27 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3
 ; VI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v63, s30, 0
-; VI-NEXT:    v_writelane_b32 v63, s31, 1
-; VI-NEXT:    v_writelane_b32 v63, s34, 2
-; VI-NEXT:    v_writelane_b32 v63, s35, 3
-; VI-NEXT:    v_writelane_b32 v63, s36, 4
-; VI-NEXT:    v_writelane_b32 v63, s37, 5
-; VI-NEXT:    v_writelane_b32 v63, s38, 6
-; VI-NEXT:    v_writelane_b32 v63, s39, 7
-; VI-NEXT:    v_writelane_b32 v63, s48, 8
-; VI-NEXT:    v_writelane_b32 v63, s49, 9
-; VI-NEXT:    v_writelane_b32 v63, s50, 10
-; VI-NEXT:    v_writelane_b32 v63, s51, 11
-; VI-NEXT:    v_writelane_b32 v63, s52, 12
-; VI-NEXT:    v_writelane_b32 v63, s53, 13
-; VI-NEXT:    v_writelane_b32 v63, s54, 14
-; VI-NEXT:    v_writelane_b32 v63, s55, 15
-; VI-NEXT:    v_writelane_b32 v63, s64, 16
-; VI-NEXT:    v_writelane_b32 v63, s65, 17
-; VI-NEXT:    v_writelane_b32 v63, s66, 18
+; VI-NEXT:    v_writelane_b32 v63, s34, 0
+; VI-NEXT:    v_writelane_b32 v63, s35, 1
+; VI-NEXT:    v_writelane_b32 v63, s36, 2
+; VI-NEXT:    v_writelane_b32 v63, s37, 3
+; VI-NEXT:    v_writelane_b32 v63, s38, 4
+; VI-NEXT:    v_writelane_b32 v63, s39, 5
+; VI-NEXT:    v_writelane_b32 v63, s48, 6
+; VI-NEXT:    v_writelane_b32 v63, s49, 7
+; VI-NEXT:    v_writelane_b32 v63, s50, 8
+; VI-NEXT:    v_writelane_b32 v63, s51, 9
+; VI-NEXT:    v_writelane_b32 v63, s52, 10
+; VI-NEXT:    v_writelane_b32 v63, s53, 11
+; VI-NEXT:    v_writelane_b32 v63, s54, 12
+; VI-NEXT:    v_writelane_b32 v63, s55, 13
+; VI-NEXT:    v_writelane_b32 v63, s64, 14
+; VI-NEXT:    v_writelane_b32 v63, s65, 15
+; VI-NEXT:    v_writelane_b32 v63, s66, 16
+; VI-NEXT:    v_writelane_b32 v63, s67, 17
+; VI-NEXT:    v_writelane_b32 v63, s30, 18
 ; VI-NEXT:    v_readfirstlane_b32 s4, v3
-; VI-NEXT:    v_writelane_b32 v63, s67, 19
+; VI-NEXT:    v_writelane_b32 v63, s31, 19
 ; VI-NEXT:    v_readfirstlane_b32 s5, v2
 ; VI-NEXT:    s_cmp_lg_u32 s4, 0
 ; VI-NEXT:    v_readfirstlane_b32 s4, v1
@@ -25551,26 +25551,26 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3
 ; VI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
 ; VI-NEXT:    v_perm_b32 v3, v3, v34, s4
 ; VI-NEXT:    v_perm_b32 v1, v1, v29, s4
-; VI-NEXT:    v_readlane_b32 s67, v63, 19
-; VI-NEXT:    v_readlane_b32 s66, v63, 18
-; VI-NEXT:    v_readlane_b32 s65, v63, 17
-; VI-NEXT:    v_readlane_b32 s64, v63, 16
-; VI-NEXT:    v_readlane_b32 s55, v63, 15
-; VI-NEXT:    v_readlane_b32 s54, v63, 14
-; VI-NEXT:    v_readlane_b32 s53, v63, 13
-; VI-NEXT:    v_readlane_b32 s52, v63, 12
-; VI-NEXT:    v_readlane_b32 s51, v63, 11
-; VI-NEXT:    v_readlane_b32 s50, v63, 10
-; VI-NEXT:    v_readlane_b32 s49, v63, 9
-; VI-NEXT:    v_readlane_b32 s48, v63, 8
-; VI-NEXT:    v_readlane_b32 s39, v63, 7
-; VI-NEXT:    v_readlane_b32 s38, v63, 6
-; VI-NEXT:    v_readlane_b32 s37, v63, 5
-; VI-NEXT:    v_readlane_b32 s36, v63, 4
-; VI-NEXT:    v_readlane_b32 s35, v63, 3
-; VI-NEXT:    v_readlane_b32 s34, v63, 2
-; VI-NEXT:    v_readlane_b32 s31, v63, 1
-; VI-NEXT:    v_readlane_b32 s30, v63, 0
+; VI-NEXT:    v_readlane_b32 s30, v63, 18
+; VI-NEXT:    v_readlane_b32 s31, v63, 19
+; VI-NEXT:    v_readlane_b32 s67, v63, 17
+; VI-NEXT:    v_readlane_b32 s66, v63, 16
+; VI-NEXT:    v_readlane_b32 s65, v63, 15
+; VI-NEXT:    v_readlane_b32 s64, v63, 14
+; VI-NEXT:    v_readlane_b32 s55, v63, 13
+; VI-NEXT:    v_readlane_b32 s54, v63, 12
+; VI-NEXT:    v_readlane_b32 s53, v63, 11
+; VI-NEXT:    v_readlane_b32 s52, v63, 10
+; VI-NEXT:    v_readlane_b32 s51, v63, 9
+; VI-NEXT:    v_readlane_b32 s50, v63, 8
+; VI-NEXT:    v_readlane_b32 s49, v63, 7
+; VI-NEXT:    v_readlane_b32 s48, v63, 6
+; VI-NEXT:    v_readlane_b32 s39, v63, 5
+; VI-NEXT:    v_readlane_b32 s38, v63, 4
+; VI-NEXT:    v_readlane_b32 s37, v63, 3
+; VI-NEXT:    v_readlane_b32 s36, v63, 2
+; VI-NEXT:    v_readlane_b32 s35, v63, 1
+; VI-NEXT:    v_readlane_b32 s34, v63, 0
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_perm_b32 v5, v33, v5, s4
 ; VI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
@@ -25624,23 +25624,23 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3
 ; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v63, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v63, s31, 1
-; GFX9-NEXT:    v_writelane_b32 v63, s34, 2
-; GFX9-NEXT:    v_writelane_b32 v63, s35, 3
-; GFX9-NEXT:    v_writelane_b32 v63, s36, 4
-; GFX9-NEXT:    v_writelane_b32 v63, s37, 5
-; GFX9-NEXT:    v_writelane_b32 v63, s38, 6
-; GFX9-NEXT:    v_writelane_b32 v63, s39, 7
-; GFX9-NEXT:    v_writelane_b32 v63, s48, 8
-; GFX9-NEXT:    v_writelane_b32 v63, s49, 9
-; GFX9-NEXT:    v_writelane_b32 v63, s50, 10
-; GFX9-NEXT:    v_writelane_b32 v63, s51, 11
-; GFX9-NEXT:    v_writelane_b32 v63, s52, 12
-; GFX9-NEXT:    v_writelane_b32 v63, s53, 13
-; GFX9-NEXT:    v_writelane_b32 v63, s54, 14
+; GFX9-NEXT:    v_writelane_b32 v63, s34, 0
+; GFX9-NEXT:    v_writelane_b32 v63, s35, 1
+; GFX9-NEXT:    v_writelane_b32 v63, s36, 2
+; GFX9-NEXT:    v_writelane_b32 v63, s37, 3
+; GFX9-NEXT:    v_writelane_b32 v63, s38, 4
+; GFX9-NEXT:    v_writelane_b32 v63, s39, 5
+; GFX9-NEXT:    v_writelane_b32 v63, s48, 6
+; GFX9-NEXT:    v_writelane_b32 v63, s49, 7
+; GFX9-NEXT:    v_writelane_b32 v63, s50, 8
+; GFX9-NEXT:    v_writelane_b32 v63, s51, 9
+; GFX9-NEXT:    v_writelane_b32 v63, s52, 10
+; GFX9-NEXT:    v_writelane_b32 v63, s53, 11
+; GFX9-NEXT:    v_writelane_b32 v63, s54, 12
+; GFX9-NEXT:    v_writelane_b32 v63, s55, 13
+; GFX9-NEXT:    v_writelane_b32 v63, s30, 14
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v3
-; GFX9-NEXT:    v_writelane_b32 v63, s55, 15
+; GFX9-NEXT:    v_writelane_b32 v63, s31, 15
 ; GFX9-NEXT:    v_readfirstlane_b32 s5, v2
 ; GFX9-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
@@ -25970,22 +25970,22 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3
 ; GFX9-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
 ; GFX9-NEXT:    v_perm_b32 v3, v3, v34, s4
 ; GFX9-NEXT:    v_perm_b32 v1, v1, v29, s4
-; GFX9-NEXT:    v_readlane_b32 s55, v63, 15
-; GFX9-NEXT:    v_readlane_b32 s54, v63, 14
-; GFX9-NEXT:    v_readlane_b32 s53, v63, 13
-; GFX9-NEXT:    v_readlane_b32 s52, v63, 12
-; GFX9-NEXT:    v_readlane_b32 s51, v63, 11
-; GFX9-NEXT:    v_readlane_b32 s50, v63, 10
-; GFX9-NEXT:    v_readlane_b32 s49, v63, 9
-; GFX9-NEXT:    v_readlane_b32 s48, v63, 8
-; GFX9-NEXT:    v_readlane_b32 s39, v63, 7
-; GFX9-NEXT:    v_readlane_b32 s38, v63, 6
-; GFX9-NEXT:    v_readlane_b32 s37, v63, 5
-; GFX9-NEXT:    v_readlane_b32 s36, v63, 4
-; GFX9-NEXT:    v_readlane_b32 s35, v63, 3
-; GFX9-NEXT:    v_readlane_b32 s34, v63, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v63, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v63, 0
+; GFX9-NEXT:    v_readlane_b32 s30, v63, 14
+; GFX9-NEXT:    v_readlane_b32 s31, v63, 15
+; GFX9-NEXT:    v_readlane_b32 s55, v63, 13
+; GFX9-NEXT:    v_readlane_b32 s54, v63, 12
+; GFX9-NEXT:    v_readlane_b32 s53, v63, 11
+; GFX9-NEXT:    v_readlane_b32 s52, v63, 10
+; GFX9-NEXT:    v_readlane_b32 s51, v63, 9
+; GFX9-NEXT:    v_readlane_b32 s50, v63, 8
+; GFX9-NEXT:    v_readlane_b32 s49, v63, 7
+; GFX9-NEXT:    v_readlane_b32 s48, v63, 6
+; GFX9-NEXT:    v_readlane_b32 s39, v63, 5
+; GFX9-NEXT:    v_readlane_b32 s38, v63, 4
+; GFX9-NEXT:    v_readlane_b32 s37, v63, 3
+; GFX9-NEXT:    v_readlane_b32 s36, v63, 2
+; GFX9-NEXT:    v_readlane_b32 s35, v63, 1
+; GFX9-NEXT:    v_readlane_b32 s34, v63, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_perm_b32 v5, v33, v5, s4
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
@@ -26036,18 +26036,18 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3
 ; GFX11-NEXT:    s_or_saveexec_b32 s4, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s4
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v40, s34, 0
 ; GFX11-NEXT:    s_cmp_lg_u32 s28, 0
 ; GFX11-NEXT:    s_mov_b32 s42, 0
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX11-NEXT:    v_writelane_b32 v40, s34, 2
-; GFX11-NEXT:    v_writelane_b32 v40, s35, 3
-; GFX11-NEXT:    v_writelane_b32 v40, s36, 4
-; GFX11-NEXT:    v_writelane_b32 v40, s37, 5
-; GFX11-NEXT:    v_writelane_b32 v40, s38, 6
-; GFX11-NEXT:    v_writelane_b32 v40, s39, 7
-; GFX11-NEXT:    v_writelane_b32 v40, s48, 8
-; GFX11-NEXT:    v_writelane_b32 v40, s49, 9
+; GFX11-NEXT:    v_writelane_b32 v40, s35, 1
+; GFX11-NEXT:    v_writelane_b32 v40, s36, 2
+; GFX11-NEXT:    v_writelane_b32 v40, s37, 3
+; GFX11-NEXT:    v_writelane_b32 v40, s38, 4
+; GFX11-NEXT:    v_writelane_b32 v40, s39, 5
+; GFX11-NEXT:    v_writelane_b32 v40, s48, 6
+; GFX11-NEXT:    v_writelane_b32 v40, s49, 7
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 8
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 9
 ; GFX11-NEXT:    s_cbranch_scc0 .LBB49_3
 ; GFX11-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX11-NEXT:    s_lshr_b32 s43, s27, 24
@@ -26315,21 +26315,21 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3
 ; GFX11-NEXT:    v_or_b32_e32 v2, v4, v17
 ; GFX11-NEXT:    v_or_b32_e32 v3, v19, v15
 ; GFX11-NEXT:    v_or_b32_e32 v4, v16, v18
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 8
 ; GFX11-NEXT:    s_clause 0x3
 ; GFX11-NEXT:    scratch_store_b128 v0, v[80:83], off
 ; GFX11-NEXT:    scratch_store_b128 v0, v[11:14], off offset:16
 ; GFX11-NEXT:    scratch_store_b128 v0, v[7:10], off offset:32
 ; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:48
-; GFX11-NEXT:    v_readlane_b32 s49, v40, 9
-; GFX11-NEXT:    v_readlane_b32 s48, v40, 8
-; GFX11-NEXT:    v_readlane_b32 s39, v40, 7
-; GFX11-NEXT:    v_readlane_b32 s38, v40, 6
-; GFX11-NEXT:    v_readlane_b32 s37, v40, 5
-; GFX11-NEXT:    v_readlane_b32 s36, v40, 4
-; GFX11-NEXT:    v_readlane_b32 s35, v40, 3
-; GFX11-NEXT:    v_readlane_b32 s34, v40, 2
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 9
+; GFX11-NEXT:    v_readlane_b32 s49, v40, 7
+; GFX11-NEXT:    v_readlane_b32 s48, v40, 6
+; GFX11-NEXT:    v_readlane_b32 s39, v40, 5
+; GFX11-NEXT:    v_readlane_b32 s38, v40, 4
+; GFX11-NEXT:    v_readlane_b32 s37, v40, 3
+; GFX11-NEXT:    v_readlane_b32 s36, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s35, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s34, v40, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
@@ -26352,7 +26352,7 @@ end:
   ret <64 x i8> %phi
 }
 
-define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) {
+define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v64i8_to_v16f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -28628,7 +28628,7 @@ end:
   ret <16 x float> %phi
 }
 
-define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i32 inreg %b) {
+define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v64i8_to_v16f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -30265,7 +30265,7 @@ end:
   ret <16 x float> %phi
 }
 
-define <8 x double> @bitcast_v8i64_to_v8f64(<8 x i64> %a, i32 %b) {
+define <8 x double> @bitcast_v8i64_to_v8f64(<8 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8i64_to_v8f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -30403,7 +30403,7 @@ end:
   ret <8 x double> %phi
 }
 
-define inreg <8 x double> @bitcast_v8i64_to_v8f64_scalar(<8 x i64> inreg %a, i32 inreg %b) {
+define inreg <8 x double> @bitcast_v8i64_to_v8f64_scalar(<8 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8i64_to_v8f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -30603,7 +30603,7 @@ end:
   ret <8 x double> %phi
 }
 
-define <8 x i64> @bitcast_v8f64_to_v8i64(<8 x double> %a, i32 %b) {
+define <8 x i64> @bitcast_v8f64_to_v8i64(<8 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8f64_to_v8i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -30705,7 +30705,7 @@ end:
   ret <8 x i64> %phi
 }
 
-define inreg <8 x i64> @bitcast_v8f64_to_v8i64_scalar(<8 x double> inreg %a, i32 inreg %b) {
+define inreg <8 x i64> @bitcast_v8f64_to_v8i64_scalar(<8 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8f64_to_v8i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -30752,8 +30752,8 @@ define inreg <8 x i64> @bitcast_v8f64_to_v8i64_scalar(<8 x double> inreg %a, i32
 ; SI-NEXT:    v_mov_b32_e32 v14, s30
 ; SI-NEXT:    v_mov_b32_e32 v15, s31
 ; SI-NEXT:  .LBB55_5: ; %end
-; SI-NEXT:    v_readlane_b32 s31, v16, 1
 ; SI-NEXT:    v_readlane_b32 s30, v16, 0
+; SI-NEXT:    v_readlane_b32 s31, v16, 1
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -30805,8 +30805,8 @@ define inreg <8 x i64> @bitcast_v8f64_to_v8i64_scalar(<8 x double> inreg %a, i32
 ; VI-NEXT:    v_mov_b32_e32 v14, s30
 ; VI-NEXT:    v_mov_b32_e32 v15, s31
 ; VI-NEXT:  .LBB55_5: ; %end
-; VI-NEXT:    v_readlane_b32 s31, v16, 1
 ; VI-NEXT:    v_readlane_b32 s30, v16, 0
+; VI-NEXT:    v_readlane_b32 s31, v16, 1
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -30858,8 +30858,8 @@ define inreg <8 x i64> @bitcast_v8f64_to_v8i64_scalar(<8 x double> inreg %a, i32
 ; GFX9-NEXT:    v_mov_b32_e32 v14, s30
 ; GFX9-NEXT:    v_mov_b32_e32 v15, s31
 ; GFX9-NEXT:  .LBB55_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v16, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -30918,7 +30918,7 @@ end:
   ret <8 x i64> %phi
 }
 
-define <32 x i16> @bitcast_v8i64_to_v32i16(<8 x i64> %a, i32 %b) {
+define <32 x i16> @bitcast_v8i64_to_v32i16(<8 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8i64_to_v32i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -31155,7 +31155,7 @@ end:
   ret <32 x i16> %phi
 }
 
-define inreg <32 x i16> @bitcast_v8i64_to_v32i16_scalar(<8 x i64> inreg %a, i32 inreg %b) {
+define inreg <32 x i16> @bitcast_v8i64_to_v32i16_scalar(<8 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8i64_to_v32i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -31452,7 +31452,7 @@ end:
   ret <32 x i16> %phi
 }
 
-define <8 x i64> @bitcast_v32i16_to_v8i64(<32 x i16> %a, i32 %b) {
+define <8 x i64> @bitcast_v32i16_to_v8i64(<32 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v32i16_to_v8i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -31788,7 +31788,7 @@ end:
   ret <8 x i64> %phi
 }
 
-define inreg <8 x i64> @bitcast_v32i16_to_v8i64_scalar(<32 x i16> inreg %a, i32 inreg %b) {
+define inreg <8 x i64> @bitcast_v32i16_to_v8i64_scalar(<32 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v32i16_to_v8i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -32155,8 +32155,8 @@ define inreg <8 x i64> @bitcast_v32i16_to_v8i64_scalar(<32 x i16> inreg %a, i32
 ; GFX9-NEXT:    v_mov_b32_e32 v14, s30
 ; GFX9-NEXT:    v_mov_b32_e32 v15, s31
 ; GFX9-NEXT:  .LBB59_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v16, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -32223,7 +32223,7 @@ end:
   ret <8 x i64> %phi
 }
 
-define <32 x half> @bitcast_v8i64_to_v32f16(<8 x i64> %a, i32 %b) {
+define <32 x half> @bitcast_v8i64_to_v32f16(<8 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8i64_to_v32f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -32460,7 +32460,7 @@ end:
   ret <32 x half> %phi
 }
 
-define inreg <32 x half> @bitcast_v8i64_to_v32f16_scalar(<8 x i64> inreg %a, i32 inreg %b) {
+define inreg <32 x half> @bitcast_v8i64_to_v32f16_scalar(<8 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8i64_to_v32f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -32757,7 +32757,7 @@ end:
   ret <32 x half> %phi
 }
 
-define <8 x i64> @bitcast_v32f16_to_v8i64(<32 x half> %a, i32 %b) {
+define <8 x i64> @bitcast_v32f16_to_v8i64(<32 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v32f16_to_v8i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -33157,7 +33157,7 @@ end:
   ret <8 x i64> %phi
 }
 
-define inreg <8 x i64> @bitcast_v32f16_to_v8i64_scalar(<32 x half> inreg %a, i32 inreg %b) {
+define inreg <8 x i64> @bitcast_v32f16_to_v8i64_scalar(<32 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v32f16_to_v8i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -33527,8 +33527,8 @@ define inreg <8 x i64> @bitcast_v32f16_to_v8i64_scalar(<32 x half> inreg %a, i32
 ; VI-NEXT:    v_mov_b32_e32 v14, s30
 ; VI-NEXT:    v_mov_b32_e32 v15, s31
 ; VI-NEXT:  .LBB63_5: ; %end
-; VI-NEXT:    v_readlane_b32 s31, v17, 1
 ; VI-NEXT:    v_readlane_b32 s30, v17, 0
+; VI-NEXT:    v_readlane_b32 s31, v17, 1
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -33589,8 +33589,8 @@ define inreg <8 x i64> @bitcast_v32f16_to_v8i64_scalar(<32 x half> inreg %a, i32
 ; GFX9-NEXT:    v_mov_b32_e32 v14, s30
 ; GFX9-NEXT:    v_mov_b32_e32 v15, s31
 ; GFX9-NEXT:  .LBB63_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v16, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -33657,7 +33657,7 @@ end:
   ret <8 x i64> %phi
 }
 
-define <32 x bfloat> @bitcast_v8i64_to_v32bf16(<8 x i64> %a, i32 %b) {
+define <32 x bfloat> @bitcast_v8i64_to_v32bf16(<8 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8i64_to_v32bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -33974,7 +33974,7 @@ end:
   ret <32 x bfloat> %phi
 }
 
-define inreg <32 x bfloat> @bitcast_v8i64_to_v32bf16_scalar(<8 x i64> inreg %a, i32 inreg %b) {
+define inreg <32 x bfloat> @bitcast_v8i64_to_v32bf16_scalar(<8 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8i64_to_v32bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -34319,7 +34319,7 @@ end:
   ret <32 x bfloat> %phi
 }
 
-define <8 x i64> @bitcast_v32bf16_to_v8i64(<32 x bfloat> %a, i32 %b) {
+define <8 x i64> @bitcast_v32bf16_to_v8i64(<32 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v32bf16_to_v8i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -35725,7 +35725,7 @@ end:
   ret <8 x i64> %phi
 }
 
-define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v32bf16_to_v8i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -36305,8 +36305,8 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a,
 ; VI-NEXT:    v_mov_b32_e32 v14, s30
 ; VI-NEXT:    v_mov_b32_e32 v15, s31
 ; VI-NEXT:  .LBB67_5: ; %end
-; VI-NEXT:    v_readlane_b32 s31, v20, 1
 ; VI-NEXT:    v_readlane_b32 s30, v20, 0
+; VI-NEXT:    v_readlane_b32 s31, v20, 1
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -36656,8 +36656,8 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a,
 ; GFX9-NEXT:    v_mov_b32_e32 v14, s30
 ; GFX9-NEXT:    v_mov_b32_e32 v15, s31
 ; GFX9-NEXT:  .LBB67_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s31, v20, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v20, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v20, 1
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -37371,7 +37371,7 @@ end:
   ret <8 x i64> %phi
 }
 
-define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) {
+define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8i64_to_v64i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -38848,7 +38848,7 @@ end:
   ret <64 x i8> %phi
 }
 
-define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 inreg %b) {
+define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8i64_to_v64i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -38856,40 +38856,40 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v4, s30, 0
-; SI-NEXT:    v_writelane_b32 v4, s31, 1
-; SI-NEXT:    v_writelane_b32 v4, s34, 2
-; SI-NEXT:    v_writelane_b32 v4, s35, 3
-; SI-NEXT:    v_writelane_b32 v4, s36, 4
-; SI-NEXT:    v_writelane_b32 v4, s37, 5
-; SI-NEXT:    v_writelane_b32 v4, s38, 6
-; SI-NEXT:    v_writelane_b32 v4, s39, 7
-; SI-NEXT:    v_writelane_b32 v4, s48, 8
-; SI-NEXT:    v_writelane_b32 v4, s49, 9
-; SI-NEXT:    v_writelane_b32 v4, s50, 10
-; SI-NEXT:    v_writelane_b32 v4, s51, 11
-; SI-NEXT:    v_writelane_b32 v4, s52, 12
-; SI-NEXT:    v_writelane_b32 v4, s53, 13
-; SI-NEXT:    v_writelane_b32 v4, s54, 14
-; SI-NEXT:    v_writelane_b32 v4, s55, 15
-; SI-NEXT:    v_writelane_b32 v4, s64, 16
-; SI-NEXT:    v_writelane_b32 v4, s65, 17
-; SI-NEXT:    v_writelane_b32 v4, s66, 18
-; SI-NEXT:    v_writelane_b32 v4, s67, 19
-; SI-NEXT:    v_writelane_b32 v4, s68, 20
-; SI-NEXT:    v_writelane_b32 v4, s69, 21
-; SI-NEXT:    v_writelane_b32 v4, s70, 22
-; SI-NEXT:    v_writelane_b32 v4, s71, 23
-; SI-NEXT:    v_writelane_b32 v4, s80, 24
-; SI-NEXT:    v_writelane_b32 v4, s81, 25
-; SI-NEXT:    v_writelane_b32 v4, s82, 26
-; SI-NEXT:    v_writelane_b32 v4, s83, 27
+; SI-NEXT:    v_writelane_b32 v4, s34, 0
+; SI-NEXT:    v_writelane_b32 v4, s35, 1
+; SI-NEXT:    v_writelane_b32 v4, s36, 2
+; SI-NEXT:    v_writelane_b32 v4, s37, 3
+; SI-NEXT:    v_writelane_b32 v4, s38, 4
+; SI-NEXT:    v_writelane_b32 v4, s39, 5
+; SI-NEXT:    v_writelane_b32 v4, s48, 6
+; SI-NEXT:    v_writelane_b32 v4, s49, 7
+; SI-NEXT:    v_writelane_b32 v4, s50, 8
+; SI-NEXT:    v_writelane_b32 v4, s51, 9
+; SI-NEXT:    v_writelane_b32 v4, s52, 10
+; SI-NEXT:    v_writelane_b32 v4, s53, 11
+; SI-NEXT:    v_writelane_b32 v4, s54, 12
+; SI-NEXT:    v_writelane_b32 v4, s55, 13
+; SI-NEXT:    v_writelane_b32 v4, s64, 14
+; SI-NEXT:    v_writelane_b32 v4, s65, 15
+; SI-NEXT:    v_writelane_b32 v4, s66, 16
+; SI-NEXT:    v_writelane_b32 v4, s67, 17
+; SI-NEXT:    v_writelane_b32 v4, s68, 18
+; SI-NEXT:    v_writelane_b32 v4, s69, 19
+; SI-NEXT:    v_writelane_b32 v4, s70, 20
+; SI-NEXT:    v_writelane_b32 v4, s71, 21
+; SI-NEXT:    v_writelane_b32 v4, s80, 22
+; SI-NEXT:    v_writelane_b32 v4, s81, 23
+; SI-NEXT:    v_writelane_b32 v4, s82, 24
+; SI-NEXT:    v_writelane_b32 v4, s83, 25
+; SI-NEXT:    v_writelane_b32 v4, s84, 26
+; SI-NEXT:    v_writelane_b32 v4, s85, 27
 ; SI-NEXT:    v_readfirstlane_b32 s4, v3
-; SI-NEXT:    v_writelane_b32 v4, s84, 28
+; SI-NEXT:    v_writelane_b32 v4, s30, 28
 ; SI-NEXT:    v_readfirstlane_b32 s5, v2
 ; SI-NEXT:    s_cmp_lg_u32 s4, 0
 ; SI-NEXT:    v_readfirstlane_b32 s4, v1
-; SI-NEXT:    v_writelane_b32 v4, s85, 29
+; SI-NEXT:    v_writelane_b32 v4, s31, 29
 ; SI-NEXT:    s_cbranch_scc0 .LBB69_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s38, s5, 24
@@ -39210,37 +39210,37 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in
 ; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 60, v0
 ; SI-NEXT:    v_mov_b32_e32 v1, s4
+; SI-NEXT:    v_readlane_b32 s30, v4, 28
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT:    v_readlane_b32 s85, v4, 29
-; SI-NEXT:    v_readlane_b32 s84, v4, 28
-; SI-NEXT:    v_readlane_b32 s83, v4, 27
-; SI-NEXT:    v_readlane_b32 s82, v4, 26
-; SI-NEXT:    v_readlane_b32 s81, v4, 25
-; SI-NEXT:    v_readlane_b32 s80, v4, 24
-; SI-NEXT:    v_readlane_b32 s71, v4, 23
-; SI-NEXT:    v_readlane_b32 s70, v4, 22
-; SI-NEXT:    v_readlane_b32 s69, v4, 21
-; SI-NEXT:    v_readlane_b32 s68, v4, 20
-; SI-NEXT:    v_readlane_b32 s67, v4, 19
-; SI-NEXT:    v_readlane_b32 s66, v4, 18
-; SI-NEXT:    v_readlane_b32 s65, v4, 17
-; SI-NEXT:    v_readlane_b32 s64, v4, 16
-; SI-NEXT:    v_readlane_b32 s55, v4, 15
-; SI-NEXT:    v_readlane_b32 s54, v4, 14
-; SI-NEXT:    v_readlane_b32 s53, v4, 13
-; SI-NEXT:    v_readlane_b32 s52, v4, 12
-; SI-NEXT:    v_readlane_b32 s51, v4, 11
-; SI-NEXT:    v_readlane_b32 s50, v4, 10
-; SI-NEXT:    v_readlane_b32 s49, v4, 9
-; SI-NEXT:    v_readlane_b32 s48, v4, 8
-; SI-NEXT:    v_readlane_b32 s39, v4, 7
-; SI-NEXT:    v_readlane_b32 s38, v4, 6
-; SI-NEXT:    v_readlane_b32 s37, v4, 5
-; SI-NEXT:    v_readlane_b32 s36, v4, 4
-; SI-NEXT:    v_readlane_b32 s35, v4, 3
-; SI-NEXT:    v_readlane_b32 s34, v4, 2
-; SI-NEXT:    v_readlane_b32 s31, v4, 1
-; SI-NEXT:    v_readlane_b32 s30, v4, 0
+; SI-NEXT:    v_readlane_b32 s31, v4, 29
+; SI-NEXT:    v_readlane_b32 s85, v4, 27
+; SI-NEXT:    v_readlane_b32 s84, v4, 26
+; SI-NEXT:    v_readlane_b32 s83, v4, 25
+; SI-NEXT:    v_readlane_b32 s82, v4, 24
+; SI-NEXT:    v_readlane_b32 s81, v4, 23
+; SI-NEXT:    v_readlane_b32 s80, v4, 22
+; SI-NEXT:    v_readlane_b32 s71, v4, 21
+; SI-NEXT:    v_readlane_b32 s70, v4, 20
+; SI-NEXT:    v_readlane_b32 s69, v4, 19
+; SI-NEXT:    v_readlane_b32 s68, v4, 18
+; SI-NEXT:    v_readlane_b32 s67, v4, 17
+; SI-NEXT:    v_readlane_b32 s66, v4, 16
+; SI-NEXT:    v_readlane_b32 s65, v4, 15
+; SI-NEXT:    v_readlane_b32 s64, v4, 14
+; SI-NEXT:    v_readlane_b32 s55, v4, 13
+; SI-NEXT:    v_readlane_b32 s54, v4, 12
+; SI-NEXT:    v_readlane_b32 s53, v4, 11
+; SI-NEXT:    v_readlane_b32 s52, v4, 10
+; SI-NEXT:    v_readlane_b32 s51, v4, 9
+; SI-NEXT:    v_readlane_b32 s50, v4, 8
+; SI-NEXT:    v_readlane_b32 s49, v4, 7
+; SI-NEXT:    v_readlane_b32 s48, v4, 6
+; SI-NEXT:    v_readlane_b32 s39, v4, 5
+; SI-NEXT:    v_readlane_b32 s38, v4, 4
+; SI-NEXT:    v_readlane_b32 s37, v4, 3
+; SI-NEXT:    v_readlane_b32 s36, v4, 2
+; SI-NEXT:    v_readlane_b32 s35, v4, 1
+; SI-NEXT:    v_readlane_b32 s34, v4, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -39303,30 +39303,30 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v4, s30, 0
-; VI-NEXT:    v_writelane_b32 v4, s31, 1
-; VI-NEXT:    v_writelane_b32 v4, s34, 2
-; VI-NEXT:    v_writelane_b32 v4, s35, 3
-; VI-NEXT:    v_writelane_b32 v4, s36, 4
-; VI-NEXT:    v_writelane_b32 v4, s37, 5
-; VI-NEXT:    v_writelane_b32 v4, s38, 6
-; VI-NEXT:    v_writelane_b32 v4, s39, 7
-; VI-NEXT:    v_writelane_b32 v4, s48, 8
-; VI-NEXT:    v_writelane_b32 v4, s49, 9
-; VI-NEXT:    v_writelane_b32 v4, s50, 10
-; VI-NEXT:    v_writelane_b32 v4, s51, 11
-; VI-NEXT:    v_writelane_b32 v4, s52, 12
-; VI-NEXT:    v_writelane_b32 v4, s53, 13
-; VI-NEXT:    v_writelane_b32 v4, s54, 14
-; VI-NEXT:    v_writelane_b32 v4, s55, 15
-; VI-NEXT:    v_writelane_b32 v4, s64, 16
-; VI-NEXT:    v_writelane_b32 v4, s65, 17
+; VI-NEXT:    v_writelane_b32 v4, s34, 0
+; VI-NEXT:    v_writelane_b32 v4, s35, 1
+; VI-NEXT:    v_writelane_b32 v4, s36, 2
+; VI-NEXT:    v_writelane_b32 v4, s37, 3
+; VI-NEXT:    v_writelane_b32 v4, s38, 4
+; VI-NEXT:    v_writelane_b32 v4, s39, 5
+; VI-NEXT:    v_writelane_b32 v4, s48, 6
+; VI-NEXT:    v_writelane_b32 v4, s49, 7
+; VI-NEXT:    v_writelane_b32 v4, s50, 8
+; VI-NEXT:    v_writelane_b32 v4, s51, 9
+; VI-NEXT:    v_writelane_b32 v4, s52, 10
+; VI-NEXT:    v_writelane_b32 v4, s53, 11
+; VI-NEXT:    v_writelane_b32 v4, s54, 12
+; VI-NEXT:    v_writelane_b32 v4, s55, 13
+; VI-NEXT:    v_writelane_b32 v4, s64, 14
+; VI-NEXT:    v_writelane_b32 v4, s65, 15
+; VI-NEXT:    v_writelane_b32 v4, s66, 16
+; VI-NEXT:    v_writelane_b32 v4, s67, 17
 ; VI-NEXT:    v_readfirstlane_b32 s4, v3
-; VI-NEXT:    v_writelane_b32 v4, s66, 18
+; VI-NEXT:    v_writelane_b32 v4, s30, 18
 ; VI-NEXT:    v_readfirstlane_b32 s5, v2
 ; VI-NEXT:    s_cmp_lg_u32 s4, 0
 ; VI-NEXT:    v_readfirstlane_b32 s4, v1
-; VI-NEXT:    v_writelane_b32 v4, s67, 19
+; VI-NEXT:    v_writelane_b32 v4, s31, 19
 ; VI-NEXT:    s_cbranch_scc0 .LBB69_4
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    s_lshr_b32 s56, s5, 24
@@ -39571,27 +39571,27 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; VI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 60, v0
+; VI-NEXT:    v_readlane_b32 s30, v4, 18
 ; VI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; VI-NEXT:    v_readlane_b32 s67, v4, 19
-; VI-NEXT:    v_readlane_b32 s66, v4, 18
-; VI-NEXT:    v_readlane_b32 s65, v4, 17
-; VI-NEXT:    v_readlane_b32 s64, v4, 16
-; VI-NEXT:    v_readlane_b32 s55, v4, 15
-; VI-NEXT:    v_readlane_b32 s54, v4, 14
-; VI-NEXT:    v_readlane_b32 s53, v4, 13
-; VI-NEXT:    v_readlane_b32 s52, v4, 12
-; VI-NEXT:    v_readlane_b32 s51, v4, 11
-; VI-NEXT:    v_readlane_b32 s50, v4, 10
-; VI-NEXT:    v_readlane_b32 s49, v4, 9
-; VI-NEXT:    v_readlane_b32 s48, v4, 8
-; VI-NEXT:    v_readlane_b32 s39, v4, 7
-; VI-NEXT:    v_readlane_b32 s38, v4, 6
-; VI-NEXT:    v_readlane_b32 s37, v4, 5
-; VI-NEXT:    v_readlane_b32 s36, v4, 4
-; VI-NEXT:    v_readlane_b32 s35, v4, 3
-; VI-NEXT:    v_readlane_b32 s34, v4, 2
-; VI-NEXT:    v_readlane_b32 s31, v4, 1
-; VI-NEXT:    v_readlane_b32 s30, v4, 0
+; VI-NEXT:    v_readlane_b32 s31, v4, 19
+; VI-NEXT:    v_readlane_b32 s67, v4, 17
+; VI-NEXT:    v_readlane_b32 s66, v4, 16
+; VI-NEXT:    v_readlane_b32 s65, v4, 15
+; VI-NEXT:    v_readlane_b32 s64, v4, 14
+; VI-NEXT:    v_readlane_b32 s55, v4, 13
+; VI-NEXT:    v_readlane_b32 s54, v4, 12
+; VI-NEXT:    v_readlane_b32 s53, v4, 11
+; VI-NEXT:    v_readlane_b32 s52, v4, 10
+; VI-NEXT:    v_readlane_b32 s51, v4, 9
+; VI-NEXT:    v_readlane_b32 s50, v4, 8
+; VI-NEXT:    v_readlane_b32 s49, v4, 7
+; VI-NEXT:    v_readlane_b32 s48, v4, 6
+; VI-NEXT:    v_readlane_b32 s39, v4, 5
+; VI-NEXT:    v_readlane_b32 s38, v4, 4
+; VI-NEXT:    v_readlane_b32 s37, v4, 3
+; VI-NEXT:    v_readlane_b32 s36, v4, 2
+; VI-NEXT:    v_readlane_b32 s35, v4, 1
+; VI-NEXT:    v_readlane_b32 s34, v4, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -39654,26 +39654,26 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v4, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v4, s31, 1
-; GFX9-NEXT:    v_writelane_b32 v4, s34, 2
-; GFX9-NEXT:    v_writelane_b32 v4, s35, 3
-; GFX9-NEXT:    v_writelane_b32 v4, s36, 4
-; GFX9-NEXT:    v_writelane_b32 v4, s37, 5
-; GFX9-NEXT:    v_writelane_b32 v4, s38, 6
-; GFX9-NEXT:    v_writelane_b32 v4, s39, 7
-; GFX9-NEXT:    v_writelane_b32 v4, s48, 8
-; GFX9-NEXT:    v_writelane_b32 v4, s49, 9
-; GFX9-NEXT:    v_writelane_b32 v4, s50, 10
-; GFX9-NEXT:    v_writelane_b32 v4, s51, 11
-; GFX9-NEXT:    v_writelane_b32 v4, s52, 12
-; GFX9-NEXT:    v_writelane_b32 v4, s53, 13
+; GFX9-NEXT:    v_writelane_b32 v4, s34, 0
+; GFX9-NEXT:    v_writelane_b32 v4, s35, 1
+; GFX9-NEXT:    v_writelane_b32 v4, s36, 2
+; GFX9-NEXT:    v_writelane_b32 v4, s37, 3
+; GFX9-NEXT:    v_writelane_b32 v4, s38, 4
+; GFX9-NEXT:    v_writelane_b32 v4, s39, 5
+; GFX9-NEXT:    v_writelane_b32 v4, s48, 6
+; GFX9-NEXT:    v_writelane_b32 v4, s49, 7
+; GFX9-NEXT:    v_writelane_b32 v4, s50, 8
+; GFX9-NEXT:    v_writelane_b32 v4, s51, 9
+; GFX9-NEXT:    v_writelane_b32 v4, s52, 10
+; GFX9-NEXT:    v_writelane_b32 v4, s53, 11
+; GFX9-NEXT:    v_writelane_b32 v4, s54, 12
+; GFX9-NEXT:    v_writelane_b32 v4, s55, 13
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v3
-; GFX9-NEXT:    v_writelane_b32 v4, s54, 14
+; GFX9-NEXT:    v_writelane_b32 v4, s30, 14
 ; GFX9-NEXT:    v_readfirstlane_b32 s5, v2
 ; GFX9-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
-; GFX9-NEXT:    v_writelane_b32 v4, s55, 15
+; GFX9-NEXT:    v_writelane_b32 v4, s31, 15
 ; GFX9-NEXT:    s_cbranch_scc0 .LBB69_4
 ; GFX9-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX9-NEXT:    s_lshr_b32 s56, s5, 24
@@ -39903,23 +39903,23 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in
 ; GFX9-NEXT:    v_perm_b32 v2, s57, v3, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX9-NEXT:    v_readlane_b32 s30, v4, 14
 ; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:60
-; GFX9-NEXT:    v_readlane_b32 s55, v4, 15
-; GFX9-NEXT:    v_readlane_b32 s54, v4, 14
-; GFX9-NEXT:    v_readlane_b32 s53, v4, 13
-; GFX9-NEXT:    v_readlane_b32 s52, v4, 12
-; GFX9-NEXT:    v_readlane_b32 s51, v4, 11
-; GFX9-NEXT:    v_readlane_b32 s50, v4, 10
-; GFX9-NEXT:    v_readlane_b32 s49, v4, 9
-; GFX9-NEXT:    v_readlane_b32 s48, v4, 8
-; GFX9-NEXT:    v_readlane_b32 s39, v4, 7
-; GFX9-NEXT:    v_readlane_b32 s38, v4, 6
-; GFX9-NEXT:    v_readlane_b32 s37, v4, 5
-; GFX9-NEXT:    v_readlane_b32 s36, v4, 4
-; GFX9-NEXT:    v_readlane_b32 s35, v4, 3
-; GFX9-NEXT:    v_readlane_b32 s34, v4, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v4, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v4, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v4, 15
+; GFX9-NEXT:    v_readlane_b32 s55, v4, 13
+; GFX9-NEXT:    v_readlane_b32 s54, v4, 12
+; GFX9-NEXT:    v_readlane_b32 s53, v4, 11
+; GFX9-NEXT:    v_readlane_b32 s52, v4, 10
+; GFX9-NEXT:    v_readlane_b32 s51, v4, 9
+; GFX9-NEXT:    v_readlane_b32 s50, v4, 8
+; GFX9-NEXT:    v_readlane_b32 s49, v4, 7
+; GFX9-NEXT:    v_readlane_b32 s48, v4, 6
+; GFX9-NEXT:    v_readlane_b32 s39, v4, 5
+; GFX9-NEXT:    v_readlane_b32 s38, v4, 4
+; GFX9-NEXT:    v_readlane_b32 s37, v4, 3
+; GFX9-NEXT:    v_readlane_b32 s36, v4, 2
+; GFX9-NEXT:    v_readlane_b32 s35, v4, 1
+; GFX9-NEXT:    v_readlane_b32 s34, v4, 0
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -39982,17 +39982,17 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in
 ; GFX11-NEXT:    s_xor_saveexec_b32 s4, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v23, s32 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s4
-; GFX11-NEXT:    v_writelane_b32 v23, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v23, s34, 0
 ; GFX11-NEXT:    s_cmp_lg_u32 s28, 0
 ; GFX11-NEXT:    s_mov_b32 vcc_lo, 0
-; GFX11-NEXT:    v_writelane_b32 v23, s31, 1
-; GFX11-NEXT:    v_writelane_b32 v23, s34, 2
-; GFX11-NEXT:    v_writelane_b32 v23, s35, 3
-; GFX11-NEXT:    v_writelane_b32 v23, s36, 4
-; GFX11-NEXT:    v_writelane_b32 v23, s37, 5
-; GFX11-NEXT:    v_writelane_b32 v23, s38, 6
-; GFX11-NEXT:    v_writelane_b32 v23, s39, 7
-; GFX11-NEXT:    v_writelane_b32 v23, s48, 8
+; GFX11-NEXT:    v_writelane_b32 v23, s35, 1
+; GFX11-NEXT:    v_writelane_b32 v23, s36, 2
+; GFX11-NEXT:    v_writelane_b32 v23, s37, 3
+; GFX11-NEXT:    v_writelane_b32 v23, s38, 4
+; GFX11-NEXT:    v_writelane_b32 v23, s39, 5
+; GFX11-NEXT:    v_writelane_b32 v23, s48, 6
+; GFX11-NEXT:    v_writelane_b32 v23, s30, 7
+; GFX11-NEXT:    v_writelane_b32 v23, s31, 8
 ; GFX11-NEXT:    s_cbranch_scc0 .LBB69_4
 ; GFX11-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX11-NEXT:    s_lshr_b32 s42, s27, 24
@@ -40114,7 +40114,7 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in
 ; GFX11-NEXT:    v_mov_b32_e32 v12, 0xc0c0004
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_perm_b32 v5, s34, s28, v12
-; GFX11-NEXT:    v_readlane_b32 s34, v23, 2
+; GFX11-NEXT:    v_readlane_b32 s34, v23, 0
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; GFX11-NEXT:    v_perm_b32 v2, s39, s40, v12
 ; GFX11-NEXT:    v_perm_b32 v4, s37, s36, v12
@@ -40178,19 +40178,19 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in
 ; GFX11-NEXT:    v_or_b32_e32 v14, v19, v17
 ; GFX11-NEXT:    v_or_b32_e32 v15, v21, v18
 ; GFX11-NEXT:    v_or_b32_e32 v16, v22, v20
+; GFX11-NEXT:    v_readlane_b32 s30, v23, 7
 ; GFX11-NEXT:    s_clause 0x3
 ; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off
 ; GFX11-NEXT:    scratch_store_b128 v0, v[5:8], off offset:16
 ; GFX11-NEXT:    scratch_store_b128 v0, v[9:12], off offset:32
 ; GFX11-NEXT:    scratch_store_b128 v0, v[13:16], off offset:48
-; GFX11-NEXT:    v_readlane_b32 s48, v23, 8
-; GFX11-NEXT:    v_readlane_b32 s39, v23, 7
-; GFX11-NEXT:    v_readlane_b32 s38, v23, 6
-; GFX11-NEXT:    v_readlane_b32 s37, v23, 5
-; GFX11-NEXT:    v_readlane_b32 s36, v23, 4
-; GFX11-NEXT:    v_readlane_b32 s35, v23, 3
-; GFX11-NEXT:    v_readlane_b32 s31, v23, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v23, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v23, 8
+; GFX11-NEXT:    v_readlane_b32 s48, v23, 6
+; GFX11-NEXT:    v_readlane_b32 s39, v23, 5
+; GFX11-NEXT:    v_readlane_b32 s38, v23, 4
+; GFX11-NEXT:    v_readlane_b32 s37, v23, 3
+; GFX11-NEXT:    v_readlane_b32 s36, v23, 2
+; GFX11-NEXT:    v_readlane_b32 s35, v23, 1
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_load_b32 v23, off, s32 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
@@ -40263,7 +40263,7 @@ end:
   ret <64 x i8> %phi
 }
 
-define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) {
+define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v64i8_to_v8i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -42539,7 +42539,7 @@ end:
   ret <8 x i64> %phi
 }
 
-define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 inreg %b) {
+define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v64i8_to_v8i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -44176,7 +44176,7 @@ end:
   ret <8 x i64> %phi
 }
 
-define <32 x i16> @bitcast_v8f64_to_v32i16(<8 x double> %a, i32 %b) {
+define <32 x i16> @bitcast_v8f64_to_v32i16(<8 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8f64_to_v32i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -44377,7 +44377,7 @@ end:
   ret <32 x i16> %phi
 }
 
-define inreg <32 x i16> @bitcast_v8f64_to_v32i16_scalar(<8 x double> inreg %a, i32 inreg %b) {
+define inreg <32 x i16> @bitcast_v8f64_to_v32i16_scalar(<8 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8f64_to_v32i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -44577,8 +44577,8 @@ define inreg <32 x i16> @bitcast_v8f64_to_v32i16_scalar(<8 x double> inreg %a, i
 ; VI-NEXT:    v_mov_b32_e32 v14, s30
 ; VI-NEXT:    v_mov_b32_e32 v15, s31
 ; VI-NEXT:  .LBB73_5: ; %end
-; VI-NEXT:    v_readlane_b32 s31, v16, 1
 ; VI-NEXT:    v_readlane_b32 s30, v16, 0
+; VI-NEXT:    v_readlane_b32 s31, v16, 1
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -44630,8 +44630,8 @@ define inreg <32 x i16> @bitcast_v8f64_to_v32i16_scalar(<8 x double> inreg %a, i
 ; GFX9-NEXT:    v_mov_b32_e32 v14, s30
 ; GFX9-NEXT:    v_mov_b32_e32 v15, s31
 ; GFX9-NEXT:  .LBB73_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v16, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -44690,7 +44690,7 @@ end:
   ret <32 x i16> %phi
 }
 
-define <8 x double> @bitcast_v32i16_to_v8f64(<32 x i16> %a, i32 %b) {
+define <8 x double> @bitcast_v32i16_to_v8f64(<32 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v32i16_to_v8f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -45026,7 +45026,7 @@ end:
   ret <8 x double> %phi
 }
 
-define inreg <8 x double> @bitcast_v32i16_to_v8f64_scalar(<32 x i16> inreg %a, i32 inreg %b) {
+define inreg <8 x double> @bitcast_v32i16_to_v8f64_scalar(<32 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v32i16_to_v8f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -45393,8 +45393,8 @@ define inreg <8 x double> @bitcast_v32i16_to_v8f64_scalar(<32 x i16> inreg %a, i
 ; GFX9-NEXT:    v_mov_b32_e32 v14, s30
 ; GFX9-NEXT:    v_mov_b32_e32 v15, s31
 ; GFX9-NEXT:  .LBB75_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v16, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -45461,7 +45461,7 @@ end:
   ret <8 x double> %phi
 }
 
-define <32 x half> @bitcast_v8f64_to_v32f16(<8 x double> %a, i32 %b) {
+define <32 x half> @bitcast_v8f64_to_v32f16(<8 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8f64_to_v32f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -45662,7 +45662,7 @@ end:
   ret <32 x half> %phi
 }
 
-define inreg <32 x half> @bitcast_v8f64_to_v32f16_scalar(<8 x double> inreg %a, i32 inreg %b) {
+define inreg <32 x half> @bitcast_v8f64_to_v32f16_scalar(<8 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8f64_to_v32f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -45862,8 +45862,8 @@ define inreg <32 x half> @bitcast_v8f64_to_v32f16_scalar(<8 x double> inreg %a,
 ; VI-NEXT:    v_mov_b32_e32 v14, s30
 ; VI-NEXT:    v_mov_b32_e32 v15, s31
 ; VI-NEXT:  .LBB77_5: ; %end
-; VI-NEXT:    v_readlane_b32 s31, v16, 1
 ; VI-NEXT:    v_readlane_b32 s30, v16, 0
+; VI-NEXT:    v_readlane_b32 s31, v16, 1
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -45915,8 +45915,8 @@ define inreg <32 x half> @bitcast_v8f64_to_v32f16_scalar(<8 x double> inreg %a,
 ; GFX9-NEXT:    v_mov_b32_e32 v14, s30
 ; GFX9-NEXT:    v_mov_b32_e32 v15, s31
 ; GFX9-NEXT:  .LBB77_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v16, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -45975,7 +45975,7 @@ end:
   ret <32 x half> %phi
 }
 
-define <8 x double> @bitcast_v32f16_to_v8f64(<32 x half> %a, i32 %b) {
+define <8 x double> @bitcast_v32f16_to_v8f64(<32 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v32f16_to_v8f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -46375,7 +46375,7 @@ end:
   ret <8 x double> %phi
 }
 
-define inreg <8 x double> @bitcast_v32f16_to_v8f64_scalar(<32 x half> inreg %a, i32 inreg %b) {
+define inreg <8 x double> @bitcast_v32f16_to_v8f64_scalar(<32 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v32f16_to_v8f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -46745,8 +46745,8 @@ define inreg <8 x double> @bitcast_v32f16_to_v8f64_scalar(<32 x half> inreg %a,
 ; VI-NEXT:    v_mov_b32_e32 v14, s30
 ; VI-NEXT:    v_mov_b32_e32 v15, s31
 ; VI-NEXT:  .LBB79_5: ; %end
-; VI-NEXT:    v_readlane_b32 s31, v17, 1
 ; VI-NEXT:    v_readlane_b32 s30, v17, 0
+; VI-NEXT:    v_readlane_b32 s31, v17, 1
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -46807,8 +46807,8 @@ define inreg <8 x double> @bitcast_v32f16_to_v8f64_scalar(<32 x half> inreg %a,
 ; GFX9-NEXT:    v_mov_b32_e32 v14, s30
 ; GFX9-NEXT:    v_mov_b32_e32 v15, s31
 ; GFX9-NEXT:  .LBB79_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v16, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -46875,7 +46875,7 @@ end:
   ret <8 x double> %phi
 }
 
-define <32 x bfloat> @bitcast_v8f64_to_v32bf16(<8 x double> %a, i32 %b) {
+define <32 x bfloat> @bitcast_v8f64_to_v32bf16(<8 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8f64_to_v32bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -47148,7 +47148,7 @@ end:
   ret <32 x bfloat> %phi
 }
 
-define inreg <32 x bfloat> @bitcast_v8f64_to_v32bf16_scalar(<8 x double> inreg %a, i32 inreg %b) {
+define inreg <32 x bfloat> @bitcast_v8f64_to_v32bf16_scalar(<8 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8f64_to_v32bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -47412,8 +47412,8 @@ define inreg <32 x bfloat> @bitcast_v8f64_to_v32bf16_scalar(<8 x double> inreg %
 ; VI-NEXT:    v_mov_b32_e32 v14, s30
 ; VI-NEXT:    v_mov_b32_e32 v15, s31
 ; VI-NEXT:  .LBB81_5: ; %end
-; VI-NEXT:    v_readlane_b32 s31, v16, 1
 ; VI-NEXT:    v_readlane_b32 s30, v16, 0
+; VI-NEXT:    v_readlane_b32 s31, v16, 1
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -47465,8 +47465,8 @@ define inreg <32 x bfloat> @bitcast_v8f64_to_v32bf16_scalar(<8 x double> inreg %
 ; GFX9-NEXT:    v_mov_b32_e32 v14, s30
 ; GFX9-NEXT:    v_mov_b32_e32 v15, s31
 ; GFX9-NEXT:  .LBB81_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v16, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -47525,7 +47525,7 @@ end:
   ret <32 x bfloat> %phi
 }
 
-define <8 x double> @bitcast_v32bf16_to_v8f64(<32 x bfloat> %a, i32 %b) {
+define <8 x double> @bitcast_v32bf16_to_v8f64(<32 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v32bf16_to_v8f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -48931,7 +48931,7 @@ end:
   ret <8 x double> %phi
 }
 
-define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v32bf16_to_v8f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -49511,8 +49511,8 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg %
 ; VI-NEXT:    v_mov_b32_e32 v14, s30
 ; VI-NEXT:    v_mov_b32_e32 v15, s31
 ; VI-NEXT:  .LBB83_5: ; %end
-; VI-NEXT:    v_readlane_b32 s31, v20, 1
 ; VI-NEXT:    v_readlane_b32 s30, v20, 0
+; VI-NEXT:    v_readlane_b32 s31, v20, 1
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -49862,8 +49862,8 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg %
 ; GFX9-NEXT:    v_mov_b32_e32 v14, s30
 ; GFX9-NEXT:    v_mov_b32_e32 v15, s31
 ; GFX9-NEXT:  .LBB83_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s31, v20, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v20, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v20, 1
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -50577,7 +50577,7 @@ end:
   ret <8 x double> %phi
 }
 
-define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) {
+define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8f64_to_v64i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -52004,7 +52004,7 @@ end:
   ret <64 x i8> %phi
 }
 
-define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 inreg %b) {
+define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8f64_to_v64i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -52012,42 +52012,42 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
 ; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v40, s30, 0
-; SI-NEXT:    v_writelane_b32 v40, s31, 1
-; SI-NEXT:    v_writelane_b32 v40, s34, 2
-; SI-NEXT:    v_writelane_b32 v40, s35, 3
-; SI-NEXT:    v_writelane_b32 v40, s36, 4
-; SI-NEXT:    v_writelane_b32 v40, s37, 5
-; SI-NEXT:    v_writelane_b32 v40, s38, 6
-; SI-NEXT:    v_writelane_b32 v40, s39, 7
-; SI-NEXT:    v_writelane_b32 v40, s48, 8
-; SI-NEXT:    v_writelane_b32 v40, s49, 9
-; SI-NEXT:    v_writelane_b32 v40, s50, 10
-; SI-NEXT:    v_writelane_b32 v40, s51, 11
-; SI-NEXT:    v_writelane_b32 v40, s52, 12
-; SI-NEXT:    v_writelane_b32 v40, s53, 13
-; SI-NEXT:    v_writelane_b32 v40, s54, 14
-; SI-NEXT:    v_writelane_b32 v40, s55, 15
-; SI-NEXT:    v_writelane_b32 v40, s64, 16
-; SI-NEXT:    v_writelane_b32 v40, s65, 17
-; SI-NEXT:    v_writelane_b32 v40, s66, 18
-; SI-NEXT:    v_writelane_b32 v40, s67, 19
-; SI-NEXT:    v_writelane_b32 v40, s68, 20
-; SI-NEXT:    v_writelane_b32 v40, s69, 21
-; SI-NEXT:    v_writelane_b32 v40, s70, 22
-; SI-NEXT:    v_writelane_b32 v40, s71, 23
-; SI-NEXT:    v_writelane_b32 v40, s80, 24
-; SI-NEXT:    v_writelane_b32 v40, s81, 25
-; SI-NEXT:    v_writelane_b32 v40, s82, 26
-; SI-NEXT:    v_writelane_b32 v40, s83, 27
-; SI-NEXT:    v_writelane_b32 v40, s84, 28
-; SI-NEXT:    v_writelane_b32 v40, s85, 29
+; SI-NEXT:    v_writelane_b32 v40, s34, 0
+; SI-NEXT:    v_writelane_b32 v40, s35, 1
+; SI-NEXT:    v_writelane_b32 v40, s36, 2
+; SI-NEXT:    v_writelane_b32 v40, s37, 3
+; SI-NEXT:    v_writelane_b32 v40, s38, 4
+; SI-NEXT:    v_writelane_b32 v40, s39, 5
+; SI-NEXT:    v_writelane_b32 v40, s48, 6
+; SI-NEXT:    v_writelane_b32 v40, s49, 7
+; SI-NEXT:    v_writelane_b32 v40, s50, 8
+; SI-NEXT:    v_writelane_b32 v40, s51, 9
+; SI-NEXT:    v_writelane_b32 v40, s52, 10
+; SI-NEXT:    v_writelane_b32 v40, s53, 11
+; SI-NEXT:    v_writelane_b32 v40, s54, 12
+; SI-NEXT:    v_writelane_b32 v40, s55, 13
+; SI-NEXT:    v_writelane_b32 v40, s64, 14
+; SI-NEXT:    v_writelane_b32 v40, s65, 15
+; SI-NEXT:    v_writelane_b32 v40, s66, 16
+; SI-NEXT:    v_writelane_b32 v40, s67, 17
+; SI-NEXT:    v_writelane_b32 v40, s68, 18
+; SI-NEXT:    v_writelane_b32 v40, s69, 19
+; SI-NEXT:    v_writelane_b32 v40, s70, 20
+; SI-NEXT:    v_writelane_b32 v40, s71, 21
+; SI-NEXT:    v_writelane_b32 v40, s80, 22
+; SI-NEXT:    v_writelane_b32 v40, s81, 23
+; SI-NEXT:    v_writelane_b32 v40, s82, 24
+; SI-NEXT:    v_writelane_b32 v40, s83, 25
+; SI-NEXT:    v_writelane_b32 v40, s84, 26
+; SI-NEXT:    v_writelane_b32 v40, s85, 27
+; SI-NEXT:    v_writelane_b32 v40, s86, 28
+; SI-NEXT:    v_writelane_b32 v40, s87, 29
 ; SI-NEXT:    v_readfirstlane_b32 s4, v3
-; SI-NEXT:    v_writelane_b32 v40, s86, 30
+; SI-NEXT:    v_writelane_b32 v40, s30, 30
 ; SI-NEXT:    v_readfirstlane_b32 s5, v2
 ; SI-NEXT:    s_cmp_lg_u32 s4, 0
 ; SI-NEXT:    v_readfirstlane_b32 s4, v1
-; SI-NEXT:    v_writelane_b32 v40, s87, 31
+; SI-NEXT:    v_writelane_b32 v40, s31, 31
 ; SI-NEXT:    s_cbranch_scc0 .LBB85_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s48, s5, 24
@@ -52446,39 +52446,39 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 60, v0
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v1, s4
+; SI-NEXT:    v_readlane_b32 s30, v40, 30
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT:    v_readlane_b32 s87, v40, 31
-; SI-NEXT:    v_readlane_b32 s86, v40, 30
-; SI-NEXT:    v_readlane_b32 s85, v40, 29
-; SI-NEXT:    v_readlane_b32 s84, v40, 28
-; SI-NEXT:    v_readlane_b32 s83, v40, 27
-; SI-NEXT:    v_readlane_b32 s82, v40, 26
-; SI-NEXT:    v_readlane_b32 s81, v40, 25
-; SI-NEXT:    v_readlane_b32 s80, v40, 24
-; SI-NEXT:    v_readlane_b32 s71, v40, 23
-; SI-NEXT:    v_readlane_b32 s70, v40, 22
-; SI-NEXT:    v_readlane_b32 s69, v40, 21
-; SI-NEXT:    v_readlane_b32 s68, v40, 20
-; SI-NEXT:    v_readlane_b32 s67, v40, 19
-; SI-NEXT:    v_readlane_b32 s66, v40, 18
-; SI-NEXT:    v_readlane_b32 s65, v40, 17
-; SI-NEXT:    v_readlane_b32 s64, v40, 16
-; SI-NEXT:    v_readlane_b32 s55, v40, 15
-; SI-NEXT:    v_readlane_b32 s54, v40, 14
-; SI-NEXT:    v_readlane_b32 s53, v40, 13
-; SI-NEXT:    v_readlane_b32 s52, v40, 12
-; SI-NEXT:    v_readlane_b32 s51, v40, 11
-; SI-NEXT:    v_readlane_b32 s50, v40, 10
-; SI-NEXT:    v_readlane_b32 s49, v40, 9
-; SI-NEXT:    v_readlane_b32 s48, v40, 8
-; SI-NEXT:    v_readlane_b32 s39, v40, 7
-; SI-NEXT:    v_readlane_b32 s38, v40, 6
-; SI-NEXT:    v_readlane_b32 s37, v40, 5
-; SI-NEXT:    v_readlane_b32 s36, v40, 4
-; SI-NEXT:    v_readlane_b32 s35, v40, 3
-; SI-NEXT:    v_readlane_b32 s34, v40, 2
-; SI-NEXT:    v_readlane_b32 s31, v40, 1
-; SI-NEXT:    v_readlane_b32 s30, v40, 0
+; SI-NEXT:    v_readlane_b32 s31, v40, 31
+; SI-NEXT:    v_readlane_b32 s87, v40, 29
+; SI-NEXT:    v_readlane_b32 s86, v40, 28
+; SI-NEXT:    v_readlane_b32 s85, v40, 27
+; SI-NEXT:    v_readlane_b32 s84, v40, 26
+; SI-NEXT:    v_readlane_b32 s83, v40, 25
+; SI-NEXT:    v_readlane_b32 s82, v40, 24
+; SI-NEXT:    v_readlane_b32 s81, v40, 23
+; SI-NEXT:    v_readlane_b32 s80, v40, 22
+; SI-NEXT:    v_readlane_b32 s71, v40, 21
+; SI-NEXT:    v_readlane_b32 s70, v40, 20
+; SI-NEXT:    v_readlane_b32 s69, v40, 19
+; SI-NEXT:    v_readlane_b32 s68, v40, 18
+; SI-NEXT:    v_readlane_b32 s67, v40, 17
+; SI-NEXT:    v_readlane_b32 s66, v40, 16
+; SI-NEXT:    v_readlane_b32 s65, v40, 15
+; SI-NEXT:    v_readlane_b32 s64, v40, 14
+; SI-NEXT:    v_readlane_b32 s55, v40, 13
+; SI-NEXT:    v_readlane_b32 s54, v40, 12
+; SI-NEXT:    v_readlane_b32 s53, v40, 11
+; SI-NEXT:    v_readlane_b32 s52, v40, 10
+; SI-NEXT:    v_readlane_b32 s51, v40, 9
+; SI-NEXT:    v_readlane_b32 s50, v40, 8
+; SI-NEXT:    v_readlane_b32 s49, v40, 7
+; SI-NEXT:    v_readlane_b32 s48, v40, 6
+; SI-NEXT:    v_readlane_b32 s39, v40, 5
+; SI-NEXT:    v_readlane_b32 s38, v40, 4
+; SI-NEXT:    v_readlane_b32 s37, v40, 3
+; SI-NEXT:    v_readlane_b32 s36, v40, 2
+; SI-NEXT:    v_readlane_b32 s35, v40, 1
+; SI-NEXT:    v_readlane_b32 s34, v40, 0
 ; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -52491,27 +52491,27 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
 ; VI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v63, s30, 0
-; VI-NEXT:    v_writelane_b32 v63, s31, 1
-; VI-NEXT:    v_writelane_b32 v63, s34, 2
-; VI-NEXT:    v_writelane_b32 v63, s35, 3
-; VI-NEXT:    v_writelane_b32 v63, s36, 4
-; VI-NEXT:    v_writelane_b32 v63, s37, 5
-; VI-NEXT:    v_writelane_b32 v63, s38, 6
-; VI-NEXT:    v_writelane_b32 v63, s39, 7
-; VI-NEXT:    v_writelane_b32 v63, s48, 8
-; VI-NEXT:    v_writelane_b32 v63, s49, 9
-; VI-NEXT:    v_writelane_b32 v63, s50, 10
-; VI-NEXT:    v_writelane_b32 v63, s51, 11
-; VI-NEXT:    v_writelane_b32 v63, s52, 12
-; VI-NEXT:    v_writelane_b32 v63, s53, 13
-; VI-NEXT:    v_writelane_b32 v63, s54, 14
-; VI-NEXT:    v_writelane_b32 v63, s55, 15
-; VI-NEXT:    v_writelane_b32 v63, s64, 16
-; VI-NEXT:    v_writelane_b32 v63, s65, 17
-; VI-NEXT:    v_writelane_b32 v63, s66, 18
+; VI-NEXT:    v_writelane_b32 v63, s34, 0
+; VI-NEXT:    v_writelane_b32 v63, s35, 1
+; VI-NEXT:    v_writelane_b32 v63, s36, 2
+; VI-NEXT:    v_writelane_b32 v63, s37, 3
+; VI-NEXT:    v_writelane_b32 v63, s38, 4
+; VI-NEXT:    v_writelane_b32 v63, s39, 5
+; VI-NEXT:    v_writelane_b32 v63, s48, 6
+; VI-NEXT:    v_writelane_b32 v63, s49, 7
+; VI-NEXT:    v_writelane_b32 v63, s50, 8
+; VI-NEXT:    v_writelane_b32 v63, s51, 9
+; VI-NEXT:    v_writelane_b32 v63, s52, 10
+; VI-NEXT:    v_writelane_b32 v63, s53, 11
+; VI-NEXT:    v_writelane_b32 v63, s54, 12
+; VI-NEXT:    v_writelane_b32 v63, s55, 13
+; VI-NEXT:    v_writelane_b32 v63, s64, 14
+; VI-NEXT:    v_writelane_b32 v63, s65, 15
+; VI-NEXT:    v_writelane_b32 v63, s66, 16
+; VI-NEXT:    v_writelane_b32 v63, s67, 17
+; VI-NEXT:    v_writelane_b32 v63, s30, 18
 ; VI-NEXT:    v_readfirstlane_b32 s4, v3
-; VI-NEXT:    v_writelane_b32 v63, s67, 19
+; VI-NEXT:    v_writelane_b32 v63, s31, 19
 ; VI-NEXT:    v_readfirstlane_b32 s5, v2
 ; VI-NEXT:    s_cmp_lg_u32 s4, 0
 ; VI-NEXT:    v_readfirstlane_b32 s4, v1
@@ -52832,26 +52832,26 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
 ; VI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
 ; VI-NEXT:    v_perm_b32 v3, v3, v36, s4
 ; VI-NEXT:    v_perm_b32 v1, v1, v31, s4
-; VI-NEXT:    v_readlane_b32 s67, v63, 19
-; VI-NEXT:    v_readlane_b32 s66, v63, 18
-; VI-NEXT:    v_readlane_b32 s65, v63, 17
-; VI-NEXT:    v_readlane_b32 s64, v63, 16
-; VI-NEXT:    v_readlane_b32 s55, v63, 15
-; VI-NEXT:    v_readlane_b32 s54, v63, 14
-; VI-NEXT:    v_readlane_b32 s53, v63, 13
-; VI-NEXT:    v_readlane_b32 s52, v63, 12
-; VI-NEXT:    v_readlane_b32 s51, v63, 11
-; VI-NEXT:    v_readlane_b32 s50, v63, 10
-; VI-NEXT:    v_readlane_b32 s49, v63, 9
-; VI-NEXT:    v_readlane_b32 s48, v63, 8
-; VI-NEXT:    v_readlane_b32 s39, v63, 7
-; VI-NEXT:    v_readlane_b32 s38, v63, 6
-; VI-NEXT:    v_readlane_b32 s37, v63, 5
-; VI-NEXT:    v_readlane_b32 s36, v63, 4
-; VI-NEXT:    v_readlane_b32 s35, v63, 3
-; VI-NEXT:    v_readlane_b32 s34, v63, 2
-; VI-NEXT:    v_readlane_b32 s31, v63, 1
-; VI-NEXT:    v_readlane_b32 s30, v63, 0
+; VI-NEXT:    v_readlane_b32 s30, v63, 18
+; VI-NEXT:    v_readlane_b32 s31, v63, 19
+; VI-NEXT:    v_readlane_b32 s67, v63, 17
+; VI-NEXT:    v_readlane_b32 s66, v63, 16
+; VI-NEXT:    v_readlane_b32 s65, v63, 15
+; VI-NEXT:    v_readlane_b32 s64, v63, 14
+; VI-NEXT:    v_readlane_b32 s55, v63, 13
+; VI-NEXT:    v_readlane_b32 s54, v63, 12
+; VI-NEXT:    v_readlane_b32 s53, v63, 11
+; VI-NEXT:    v_readlane_b32 s52, v63, 10
+; VI-NEXT:    v_readlane_b32 s51, v63, 9
+; VI-NEXT:    v_readlane_b32 s50, v63, 8
+; VI-NEXT:    v_readlane_b32 s49, v63, 7
+; VI-NEXT:    v_readlane_b32 s48, v63, 6
+; VI-NEXT:    v_readlane_b32 s39, v63, 5
+; VI-NEXT:    v_readlane_b32 s38, v63, 4
+; VI-NEXT:    v_readlane_b32 s37, v63, 3
+; VI-NEXT:    v_readlane_b32 s36, v63, 2
+; VI-NEXT:    v_readlane_b32 s35, v63, 1
+; VI-NEXT:    v_readlane_b32 s34, v63, 0
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_perm_b32 v5, v35, v5, s4
 ; VI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
@@ -52920,23 +52920,23 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v63, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v63, s31, 1
-; GFX9-NEXT:    v_writelane_b32 v63, s34, 2
-; GFX9-NEXT:    v_writelane_b32 v63, s35, 3
-; GFX9-NEXT:    v_writelane_b32 v63, s36, 4
-; GFX9-NEXT:    v_writelane_b32 v63, s37, 5
-; GFX9-NEXT:    v_writelane_b32 v63, s38, 6
-; GFX9-NEXT:    v_writelane_b32 v63, s39, 7
-; GFX9-NEXT:    v_writelane_b32 v63, s48, 8
-; GFX9-NEXT:    v_writelane_b32 v63, s49, 9
-; GFX9-NEXT:    v_writelane_b32 v63, s50, 10
-; GFX9-NEXT:    v_writelane_b32 v63, s51, 11
-; GFX9-NEXT:    v_writelane_b32 v63, s52, 12
-; GFX9-NEXT:    v_writelane_b32 v63, s53, 13
-; GFX9-NEXT:    v_writelane_b32 v63, s54, 14
+; GFX9-NEXT:    v_writelane_b32 v63, s34, 0
+; GFX9-NEXT:    v_writelane_b32 v63, s35, 1
+; GFX9-NEXT:    v_writelane_b32 v63, s36, 2
+; GFX9-NEXT:    v_writelane_b32 v63, s37, 3
+; GFX9-NEXT:    v_writelane_b32 v63, s38, 4
+; GFX9-NEXT:    v_writelane_b32 v63, s39, 5
+; GFX9-NEXT:    v_writelane_b32 v63, s48, 6
+; GFX9-NEXT:    v_writelane_b32 v63, s49, 7
+; GFX9-NEXT:    v_writelane_b32 v63, s50, 8
+; GFX9-NEXT:    v_writelane_b32 v63, s51, 9
+; GFX9-NEXT:    v_writelane_b32 v63, s52, 10
+; GFX9-NEXT:    v_writelane_b32 v63, s53, 11
+; GFX9-NEXT:    v_writelane_b32 v63, s54, 12
+; GFX9-NEXT:    v_writelane_b32 v63, s55, 13
+; GFX9-NEXT:    v_writelane_b32 v63, s30, 14
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v3
-; GFX9-NEXT:    v_writelane_b32 v63, s55, 15
+; GFX9-NEXT:    v_writelane_b32 v63, s31, 15
 ; GFX9-NEXT:    v_readfirstlane_b32 s5, v2
 ; GFX9-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
@@ -53254,22 +53254,22 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
 ; GFX9-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
 ; GFX9-NEXT:    v_perm_b32 v3, v3, v36, s4
 ; GFX9-NEXT:    v_perm_b32 v1, v1, v31, s4
-; GFX9-NEXT:    v_readlane_b32 s55, v63, 15
-; GFX9-NEXT:    v_readlane_b32 s54, v63, 14
-; GFX9-NEXT:    v_readlane_b32 s53, v63, 13
-; GFX9-NEXT:    v_readlane_b32 s52, v63, 12
-; GFX9-NEXT:    v_readlane_b32 s51, v63, 11
-; GFX9-NEXT:    v_readlane_b32 s50, v63, 10
-; GFX9-NEXT:    v_readlane_b32 s49, v63, 9
-; GFX9-NEXT:    v_readlane_b32 s48, v63, 8
-; GFX9-NEXT:    v_readlane_b32 s39, v63, 7
-; GFX9-NEXT:    v_readlane_b32 s38, v63, 6
-; GFX9-NEXT:    v_readlane_b32 s37, v63, 5
-; GFX9-NEXT:    v_readlane_b32 s36, v63, 4
-; GFX9-NEXT:    v_readlane_b32 s35, v63, 3
-; GFX9-NEXT:    v_readlane_b32 s34, v63, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v63, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v63, 0
+; GFX9-NEXT:    v_readlane_b32 s30, v63, 14
+; GFX9-NEXT:    v_readlane_b32 s31, v63, 15
+; GFX9-NEXT:    v_readlane_b32 s55, v63, 13
+; GFX9-NEXT:    v_readlane_b32 s54, v63, 12
+; GFX9-NEXT:    v_readlane_b32 s53, v63, 11
+; GFX9-NEXT:    v_readlane_b32 s52, v63, 10
+; GFX9-NEXT:    v_readlane_b32 s51, v63, 9
+; GFX9-NEXT:    v_readlane_b32 s50, v63, 8
+; GFX9-NEXT:    v_readlane_b32 s49, v63, 7
+; GFX9-NEXT:    v_readlane_b32 s48, v63, 6
+; GFX9-NEXT:    v_readlane_b32 s39, v63, 5
+; GFX9-NEXT:    v_readlane_b32 s38, v63, 4
+; GFX9-NEXT:    v_readlane_b32 s37, v63, 3
+; GFX9-NEXT:    v_readlane_b32 s36, v63, 2
+; GFX9-NEXT:    v_readlane_b32 s35, v63, 1
+; GFX9-NEXT:    v_readlane_b32 s34, v63, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_perm_b32 v5, v35, v5, s4
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
@@ -53333,18 +53333,18 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
 ; GFX11-NEXT:    s_or_saveexec_b32 s4, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s4
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v40, s34, 0
 ; GFX11-NEXT:    s_cmp_lg_u32 s28, 0
 ; GFX11-NEXT:    s_mov_b32 s42, 0
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX11-NEXT:    v_writelane_b32 v40, s34, 2
-; GFX11-NEXT:    v_writelane_b32 v40, s35, 3
-; GFX11-NEXT:    v_writelane_b32 v40, s36, 4
-; GFX11-NEXT:    v_writelane_b32 v40, s37, 5
-; GFX11-NEXT:    v_writelane_b32 v40, s38, 6
-; GFX11-NEXT:    v_writelane_b32 v40, s39, 7
-; GFX11-NEXT:    v_writelane_b32 v40, s48, 8
-; GFX11-NEXT:    v_writelane_b32 v40, s49, 9
+; GFX11-NEXT:    v_writelane_b32 v40, s35, 1
+; GFX11-NEXT:    v_writelane_b32 v40, s36, 2
+; GFX11-NEXT:    v_writelane_b32 v40, s37, 3
+; GFX11-NEXT:    v_writelane_b32 v40, s38, 4
+; GFX11-NEXT:    v_writelane_b32 v40, s39, 5
+; GFX11-NEXT:    v_writelane_b32 v40, s48, 6
+; GFX11-NEXT:    v_writelane_b32 v40, s49, 7
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 8
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 9
 ; GFX11-NEXT:    s_cbranch_scc0 .LBB85_3
 ; GFX11-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX11-NEXT:    s_lshr_b32 s90, s27, 24
@@ -53604,21 +53604,21 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
 ; GFX11-NEXT:    v_or_b32_e32 v2, v4, v8
 ; GFX11-NEXT:    v_or_b32_e32 v3, v10, v9
 ; GFX11-NEXT:    v_or_b32_e32 v4, v12, v11
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 8
 ; GFX11-NEXT:    s_clause 0x3
 ; GFX11-NEXT:    scratch_store_b128 v0, v[84:87], off
 ; GFX11-NEXT:    scratch_store_b128 v0, v[15:18], off offset:16
 ; GFX11-NEXT:    scratch_store_b128 v0, v[21:24], off offset:32
 ; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:48
-; GFX11-NEXT:    v_readlane_b32 s49, v40, 9
-; GFX11-NEXT:    v_readlane_b32 s48, v40, 8
-; GFX11-NEXT:    v_readlane_b32 s39, v40, 7
-; GFX11-NEXT:    v_readlane_b32 s38, v40, 6
-; GFX11-NEXT:    v_readlane_b32 s37, v40, 5
-; GFX11-NEXT:    v_readlane_b32 s36, v40, 4
-; GFX11-NEXT:    v_readlane_b32 s35, v40, 3
-; GFX11-NEXT:    v_readlane_b32 s34, v40, 2
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 9
+; GFX11-NEXT:    v_readlane_b32 s49, v40, 7
+; GFX11-NEXT:    v_readlane_b32 s48, v40, 6
+; GFX11-NEXT:    v_readlane_b32 s39, v40, 5
+; GFX11-NEXT:    v_readlane_b32 s38, v40, 4
+; GFX11-NEXT:    v_readlane_b32 s37, v40, 3
+; GFX11-NEXT:    v_readlane_b32 s36, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s35, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s34, v40, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
@@ -53641,7 +53641,7 @@ end:
   ret <64 x i8> %phi
 }
 
-define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) {
+define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v64i8_to_v8f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -55917,7 +55917,7 @@ end:
   ret <8 x double> %phi
 }
 
-define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 inreg %b) {
+define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v64i8_to_v8f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -57554,7 +57554,7 @@ end:
   ret <8 x double> %phi
 }
 
-define <32 x half> @bitcast_v32i16_to_v32f16(<32 x i16> %a, i32 %b) {
+define <32 x half> @bitcast_v32i16_to_v32f16(<32 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v32i16_to_v32f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -58002,7 +58002,7 @@ end:
   ret <32 x half> %phi
 }
 
-define inreg <32 x half> @bitcast_v32i16_to_v32f16_scalar(<32 x i16> inreg %a, i32 inreg %b) {
+define inreg <32 x half> @bitcast_v32i16_to_v32f16_scalar(<32 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v32i16_to_v32f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -58010,16 +58010,16 @@ define inreg <32 x half> @bitcast_v32i16_to_v32f16_scalar(<32 x i16> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v16, s30, 0
-; SI-NEXT:    v_writelane_b32 v16, s31, 1
-; SI-NEXT:    v_writelane_b32 v16, s34, 2
-; SI-NEXT:    v_writelane_b32 v16, s35, 3
-; SI-NEXT:    v_writelane_b32 v16, s36, 4
-; SI-NEXT:    v_writelane_b32 v16, s37, 5
-; SI-NEXT:    v_writelane_b32 v16, s38, 6
-; SI-NEXT:    v_writelane_b32 v16, s39, 7
+; SI-NEXT:    v_writelane_b32 v16, s34, 0
+; SI-NEXT:    v_writelane_b32 v16, s35, 1
+; SI-NEXT:    v_writelane_b32 v16, s36, 2
+; SI-NEXT:    v_writelane_b32 v16, s37, 3
+; SI-NEXT:    v_writelane_b32 v16, s38, 4
+; SI-NEXT:    v_writelane_b32 v16, s39, 5
+; SI-NEXT:    v_writelane_b32 v16, s30, 6
 ; SI-NEXT:    v_readfirstlane_b32 s37, v1
 ; SI-NEXT:    v_readfirstlane_b32 s39, v0
+; SI-NEXT:    v_writelane_b32 v16, s31, 7
 ; SI-NEXT:    s_lshr_b32 s92, s29, 16
 ; SI-NEXT:    s_lshr_b32 s36, s28, 16
 ; SI-NEXT:    s_lshr_b32 s91, s27, 16
@@ -58251,6 +58251,7 @@ define inreg <32 x half> @bitcast_v32i16_to_v32f16_scalar(<32 x i16> inreg %a, i
 ; SI-NEXT:    s_and_b32 s19, s41, 0xffff
 ; SI-NEXT:    s_lshl_b32 s20, s93, 16
 ; SI-NEXT:    s_or_b32 s19, s19, s20
+; SI-NEXT:    v_readlane_b32 s30, v16, 6
 ; SI-NEXT:    v_mov_b32_e32 v0, s14
 ; SI-NEXT:    v_mov_b32_e32 v1, s15
 ; SI-NEXT:    v_mov_b32_e32 v2, s12
@@ -58267,14 +58268,13 @@ define inreg <32 x half> @bitcast_v32i16_to_v32f16_scalar(<32 x i16> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v13, s17
 ; SI-NEXT:    v_mov_b32_e32 v14, s18
 ; SI-NEXT:    v_mov_b32_e32 v15, s19
-; SI-NEXT:    v_readlane_b32 s39, v16, 7
-; SI-NEXT:    v_readlane_b32 s38, v16, 6
-; SI-NEXT:    v_readlane_b32 s37, v16, 5
-; SI-NEXT:    v_readlane_b32 s36, v16, 4
-; SI-NEXT:    v_readlane_b32 s35, v16, 3
-; SI-NEXT:    v_readlane_b32 s34, v16, 2
-; SI-NEXT:    v_readlane_b32 s31, v16, 1
-; SI-NEXT:    v_readlane_b32 s30, v16, 0
+; SI-NEXT:    v_readlane_b32 s31, v16, 7
+; SI-NEXT:    v_readlane_b32 s39, v16, 5
+; SI-NEXT:    v_readlane_b32 s38, v16, 4
+; SI-NEXT:    v_readlane_b32 s37, v16, 3
+; SI-NEXT:    v_readlane_b32 s36, v16, 2
+; SI-NEXT:    v_readlane_b32 s35, v16, 1
+; SI-NEXT:    v_readlane_b32 s34, v16, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -58464,8 +58464,8 @@ define inreg <32 x half> @bitcast_v32i16_to_v32f16_scalar(<32 x i16> inreg %a, i
 ; GFX9-NEXT:    v_mov_b32_e32 v14, s30
 ; GFX9-NEXT:    v_mov_b32_e32 v15, s31
 ; GFX9-NEXT:  .LBB89_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v16, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -58532,7 +58532,7 @@ end:
   ret <32 x half> %phi
 }
 
-define <32 x i16> @bitcast_v32f16_to_v32i16(<32 x half> %a, i32 %b) {
+define <32 x i16> @bitcast_v32f16_to_v32i16(<32 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v32f16_to_v32i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -58884,7 +58884,7 @@ end:
   ret <32 x i16> %phi
 }
 
-define inreg <32 x i16> @bitcast_v32f16_to_v32i16_scalar(<32 x half> inreg %a, i32 inreg %b) {
+define inreg <32 x i16> @bitcast_v32f16_to_v32i16_scalar(<32 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v32f16_to_v32i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -59253,8 +59253,8 @@ define inreg <32 x i16> @bitcast_v32f16_to_v32i16_scalar(<32 x half> inreg %a, i
 ; VI-NEXT:    v_mov_b32_e32 v14, s30
 ; VI-NEXT:    v_mov_b32_e32 v15, s31
 ; VI-NEXT:  .LBB91_5: ; %end
-; VI-NEXT:    v_readlane_b32 s31, v18, 1
 ; VI-NEXT:    v_readlane_b32 s30, v18, 0
+; VI-NEXT:    v_readlane_b32 s31, v18, 1
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -59315,8 +59315,8 @@ define inreg <32 x i16> @bitcast_v32f16_to_v32i16_scalar(<32 x half> inreg %a, i
 ; GFX9-NEXT:    v_mov_b32_e32 v14, s30
 ; GFX9-NEXT:    v_mov_b32_e32 v15, s31
 ; GFX9-NEXT:  .LBB91_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v16, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -59383,7 +59383,7 @@ end:
   ret <32 x i16> %phi
 }
 
-define <32 x bfloat> @bitcast_v32i16_to_v32bf16(<32 x i16> %a, i32 %b) {
+define <32 x bfloat> @bitcast_v32i16_to_v32bf16(<32 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v32i16_to_v32bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -59778,7 +59778,7 @@ end:
   ret <32 x bfloat> %phi
 }
 
-define inreg <32 x bfloat> @bitcast_v32i16_to_v32bf16_scalar(<32 x i16> inreg %a, i32 inreg %b) {
+define inreg <32 x bfloat> @bitcast_v32i16_to_v32bf16_scalar(<32 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v32i16_to_v32bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -59786,16 +59786,16 @@ define inreg <32 x bfloat> @bitcast_v32i16_to_v32bf16_scalar(<32 x i16> inreg %a
 ; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v17, s30, 0
-; SI-NEXT:    v_writelane_b32 v17, s31, 1
-; SI-NEXT:    v_writelane_b32 v17, s34, 2
-; SI-NEXT:    v_writelane_b32 v17, s35, 3
-; SI-NEXT:    v_writelane_b32 v17, s36, 4
-; SI-NEXT:    v_writelane_b32 v17, s37, 5
-; SI-NEXT:    v_writelane_b32 v17, s38, 6
-; SI-NEXT:    v_writelane_b32 v17, s39, 7
+; SI-NEXT:    v_writelane_b32 v17, s34, 0
+; SI-NEXT:    v_writelane_b32 v17, s35, 1
+; SI-NEXT:    v_writelane_b32 v17, s36, 2
+; SI-NEXT:    v_writelane_b32 v17, s37, 3
+; SI-NEXT:    v_writelane_b32 v17, s38, 4
+; SI-NEXT:    v_writelane_b32 v17, s39, 5
+; SI-NEXT:    v_writelane_b32 v17, s30, 6
 ; SI-NEXT:    v_readfirstlane_b32 s39, v1
 ; SI-NEXT:    v_readfirstlane_b32 s37, v0
+; SI-NEXT:    v_writelane_b32 v17, s31, 7
 ; SI-NEXT:    s_lshr_b32 s35, s29, 16
 ; SI-NEXT:    s_lshr_b32 s34, s28, 16
 ; SI-NEXT:    s_lshr_b32 s31, s27, 16
@@ -60026,15 +60026,15 @@ define inreg <32 x bfloat> @bitcast_v32i16_to_v32bf16_scalar(<32 x i16> inreg %a
 ; SI-NEXT:    v_mul_f32_e64 v15, 1.0, s9
 ; SI-NEXT:    v_lshrrev_b32_e32 v16, 16, v15
 ; SI-NEXT:    v_mul_f32_e64 v15, 1.0, s6
+; SI-NEXT:    v_readlane_b32 s30, v17, 6
 ; SI-NEXT:    v_lshr_b64 v[15:16], v[15:16], 16
-; SI-NEXT:    v_readlane_b32 s39, v17, 7
-; SI-NEXT:    v_readlane_b32 s38, v17, 6
-; SI-NEXT:    v_readlane_b32 s37, v17, 5
-; SI-NEXT:    v_readlane_b32 s36, v17, 4
-; SI-NEXT:    v_readlane_b32 s35, v17, 3
-; SI-NEXT:    v_readlane_b32 s34, v17, 2
-; SI-NEXT:    v_readlane_b32 s31, v17, 1
-; SI-NEXT:    v_readlane_b32 s30, v17, 0
+; SI-NEXT:    v_readlane_b32 s31, v17, 7
+; SI-NEXT:    v_readlane_b32 s39, v17, 5
+; SI-NEXT:    v_readlane_b32 s38, v17, 4
+; SI-NEXT:    v_readlane_b32 s37, v17, 3
+; SI-NEXT:    v_readlane_b32 s36, v17, 2
+; SI-NEXT:    v_readlane_b32 s35, v17, 1
+; SI-NEXT:    v_readlane_b32 s34, v17, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -60240,8 +60240,8 @@ define inreg <32 x bfloat> @bitcast_v32i16_to_v32bf16_scalar(<32 x i16> inreg %a
 ; GFX9-NEXT:    v_mov_b32_e32 v14, s30
 ; GFX9-NEXT:    v_mov_b32_e32 v15, s31
 ; GFX9-NEXT:  .LBB93_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v16, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -60308,7 +60308,7 @@ end:
   ret <32 x bfloat> %phi
 }
 
-define <32 x i16> @bitcast_v32bf16_to_v32i16(<32 x bfloat> %a, i32 %b) {
+define <32 x i16> @bitcast_v32bf16_to_v32i16(<32 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v32bf16_to_v32i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -61782,7 +61782,7 @@ end:
   ret <32 x i16> %phi
 }
 
-define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v32bf16_to_v32i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -62454,8 +62454,8 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a
 ; VI-NEXT:    v_mov_b32_e32 v14, s30
 ; VI-NEXT:    v_mov_b32_e32 v15, s31
 ; VI-NEXT:  .LBB95_5: ; %end
-; VI-NEXT:    v_readlane_b32 s31, v20, 1
 ; VI-NEXT:    v_readlane_b32 s30, v20, 0
+; VI-NEXT:    v_readlane_b32 s31, v20, 1
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -62789,8 +62789,8 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a
 ; GFX9-NEXT:    v_mov_b32_e32 v14, s30
 ; GFX9-NEXT:    v_mov_b32_e32 v15, s31
 ; GFX9-NEXT:  .LBB95_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s31, v20, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v20, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v20, 1
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -63428,7 +63428,7 @@ end:
   ret <32 x i16> %phi
 }
 
-define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
+define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v32i16_to_v64i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -65392,7 +65392,7 @@ end:
   ret <64 x i8> %phi
 }
 
-define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 inreg %b) {
+define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v32i16_to_v64i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -65401,43 +65401,42 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32
 ; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_writelane_b32 v4, s30, 0
-; SI-NEXT:    v_writelane_b32 v4, s31, 1
-; SI-NEXT:    v_writelane_b32 v4, s34, 2
-; SI-NEXT:    v_writelane_b32 v4, s35, 3
-; SI-NEXT:    v_writelane_b32 v4, s36, 4
-; SI-NEXT:    v_writelane_b32 v4, s37, 5
-; SI-NEXT:    v_writelane_b32 v4, s38, 6
-; SI-NEXT:    v_writelane_b32 v4, s39, 7
-; SI-NEXT:    v_writelane_b32 v4, s48, 8
-; SI-NEXT:    v_writelane_b32 v4, s49, 9
-; SI-NEXT:    v_writelane_b32 v4, s50, 10
-; SI-NEXT:    v_writelane_b32 v4, s51, 11
-; SI-NEXT:    v_writelane_b32 v4, s52, 12
-; SI-NEXT:    v_writelane_b32 v4, s53, 13
-; SI-NEXT:    v_writelane_b32 v4, s54, 14
-; SI-NEXT:    v_writelane_b32 v4, s55, 15
-; SI-NEXT:    v_writelane_b32 v4, s64, 16
-; SI-NEXT:    v_writelane_b32 v4, s65, 17
-; SI-NEXT:    v_writelane_b32 v4, s66, 18
-; SI-NEXT:    v_writelane_b32 v4, s67, 19
-; SI-NEXT:    v_writelane_b32 v4, s68, 20
-; SI-NEXT:    v_writelane_b32 v4, s69, 21
-; SI-NEXT:    v_writelane_b32 v4, s70, 22
-; SI-NEXT:    v_writelane_b32 v4, s71, 23
-; SI-NEXT:    v_writelane_b32 v4, s80, 24
-; SI-NEXT:    v_writelane_b32 v4, s81, 25
-; SI-NEXT:    v_writelane_b32 v4, s82, 26
-; SI-NEXT:    v_writelane_b32 v4, s83, 27
-; SI-NEXT:    v_writelane_b32 v4, s84, 28
-; SI-NEXT:    v_writelane_b32 v4, s85, 29
-; SI-NEXT:    v_writelane_b32 v4, s86, 30
-; SI-NEXT:    v_writelane_b32 v4, s87, 31
-; SI-NEXT:    v_writelane_b32 v4, s96, 32
-; SI-NEXT:    v_writelane_b32 v4, s97, 33
+; SI-NEXT:    v_writelane_b32 v4, s34, 0
+; SI-NEXT:    v_writelane_b32 v4, s35, 1
+; SI-NEXT:    v_writelane_b32 v4, s36, 2
+; SI-NEXT:    v_writelane_b32 v4, s37, 3
+; SI-NEXT:    v_writelane_b32 v4, s38, 4
+; SI-NEXT:    v_writelane_b32 v4, s39, 5
+; SI-NEXT:    v_writelane_b32 v4, s48, 6
+; SI-NEXT:    v_writelane_b32 v4, s49, 7
+; SI-NEXT:    v_writelane_b32 v4, s50, 8
+; SI-NEXT:    v_writelane_b32 v4, s51, 9
+; SI-NEXT:    v_writelane_b32 v4, s52, 10
+; SI-NEXT:    v_writelane_b32 v4, s53, 11
+; SI-NEXT:    v_writelane_b32 v4, s54, 12
+; SI-NEXT:    v_writelane_b32 v4, s55, 13
+; SI-NEXT:    v_writelane_b32 v4, s64, 14
+; SI-NEXT:    v_writelane_b32 v4, s65, 15
+; SI-NEXT:    v_writelane_b32 v4, s66, 16
+; SI-NEXT:    v_writelane_b32 v4, s67, 17
+; SI-NEXT:    v_writelane_b32 v4, s68, 18
+; SI-NEXT:    v_writelane_b32 v4, s69, 19
+; SI-NEXT:    v_writelane_b32 v4, s70, 20
+; SI-NEXT:    v_writelane_b32 v4, s71, 21
+; SI-NEXT:    v_writelane_b32 v4, s80, 22
+; SI-NEXT:    v_writelane_b32 v4, s81, 23
+; SI-NEXT:    v_writelane_b32 v4, s82, 24
+; SI-NEXT:    v_writelane_b32 v4, s83, 25
+; SI-NEXT:    v_writelane_b32 v4, s84, 26
+; SI-NEXT:    v_writelane_b32 v4, s85, 27
+; SI-NEXT:    v_writelane_b32 v4, s86, 28
+; SI-NEXT:    v_writelane_b32 v4, s87, 29
+; SI-NEXT:    v_writelane_b32 v4, s96, 30
+; SI-NEXT:    v_writelane_b32 v4, s97, 31
+; SI-NEXT:    v_writelane_b32 v4, s98, 32
 ; SI-NEXT:    v_readfirstlane_b32 s56, v2
 ; SI-NEXT:    v_readfirstlane_b32 s60, v1
-; SI-NEXT:    v_writelane_b32 v4, s98, 34
+; SI-NEXT:    v_writelane_b32 v4, s99, 33
 ; SI-NEXT:    s_lshr_b32 s68, s29, 16
 ; SI-NEXT:    s_lshr_b32 s46, s28, 16
 ; SI-NEXT:    s_lshr_b32 s70, s27, 16
@@ -65455,8 +65454,9 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32
 ; SI-NEXT:    s_lshr_b32 s69, s56, 16
 ; SI-NEXT:    s_lshr_b32 s58, s60, 16
 ; SI-NEXT:    v_readfirstlane_b32 s4, v3
+; SI-NEXT:    v_writelane_b32 v4, s30, 34
 ; SI-NEXT:    s_cmp_lg_u32 s4, 0
-; SI-NEXT:    v_writelane_b32 v4, s99, 35
+; SI-NEXT:    v_writelane_b32 v4, s31, 35
 ; SI-NEXT:    ; implicit-def: $vgpr5 : SGPR spill to VGPR lane
 ; SI-NEXT:    s_cbranch_scc0 .LBB97_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
@@ -65913,44 +65913,44 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32
 ; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 60, v0
 ; SI-NEXT:    v_mov_b32_e32 v1, s4
+; SI-NEXT:    v_readlane_b32 s30, v4, 34
 ; SI-NEXT:    v_readlane_b32 s19, v5, 9
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT:    v_readlane_b32 s99, v4, 35
-; SI-NEXT:    v_readlane_b32 s98, v4, 34
-; SI-NEXT:    v_readlane_b32 s97, v4, 33
-; SI-NEXT:    v_readlane_b32 s96, v4, 32
-; SI-NEXT:    v_readlane_b32 s87, v4, 31
-; SI-NEXT:    v_readlane_b32 s86, v4, 30
-; SI-NEXT:    v_readlane_b32 s85, v4, 29
-; SI-NEXT:    v_readlane_b32 s84, v4, 28
-; SI-NEXT:    v_readlane_b32 s83, v4, 27
-; SI-NEXT:    v_readlane_b32 s82, v4, 26
-; SI-NEXT:    v_readlane_b32 s81, v4, 25
-; SI-NEXT:    v_readlane_b32 s80, v4, 24
-; SI-NEXT:    v_readlane_b32 s71, v4, 23
-; SI-NEXT:    v_readlane_b32 s70, v4, 22
-; SI-NEXT:    v_readlane_b32 s69, v4, 21
-; SI-NEXT:    v_readlane_b32 s68, v4, 20
-; SI-NEXT:    v_readlane_b32 s67, v4, 19
-; SI-NEXT:    v_readlane_b32 s66, v4, 18
-; SI-NEXT:    v_readlane_b32 s65, v4, 17
-; SI-NEXT:    v_readlane_b32 s64, v4, 16
-; SI-NEXT:    v_readlane_b32 s55, v4, 15
-; SI-NEXT:    v_readlane_b32 s54, v4, 14
-; SI-NEXT:    v_readlane_b32 s53, v4, 13
-; SI-NEXT:    v_readlane_b32 s52, v4, 12
-; SI-NEXT:    v_readlane_b32 s51, v4, 11
-; SI-NEXT:    v_readlane_b32 s50, v4, 10
-; SI-NEXT:    v_readlane_b32 s49, v4, 9
-; SI-NEXT:    v_readlane_b32 s48, v4, 8
-; SI-NEXT:    v_readlane_b32 s39, v4, 7
-; SI-NEXT:    v_readlane_b32 s38, v4, 6
-; SI-NEXT:    v_readlane_b32 s37, v4, 5
-; SI-NEXT:    v_readlane_b32 s36, v4, 4
-; SI-NEXT:    v_readlane_b32 s35, v4, 3
-; SI-NEXT:    v_readlane_b32 s34, v4, 2
-; SI-NEXT:    v_readlane_b32 s31, v4, 1
-; SI-NEXT:    v_readlane_b32 s30, v4, 0
+; SI-NEXT:    v_readlane_b32 s31, v4, 35
+; SI-NEXT:    v_readlane_b32 s99, v4, 33
+; SI-NEXT:    v_readlane_b32 s98, v4, 32
+; SI-NEXT:    v_readlane_b32 s97, v4, 31
+; SI-NEXT:    v_readlane_b32 s96, v4, 30
+; SI-NEXT:    v_readlane_b32 s87, v4, 29
+; SI-NEXT:    v_readlane_b32 s86, v4, 28
+; SI-NEXT:    v_readlane_b32 s85, v4, 27
+; SI-NEXT:    v_readlane_b32 s84, v4, 26
+; SI-NEXT:    v_readlane_b32 s83, v4, 25
+; SI-NEXT:    v_readlane_b32 s82, v4, 24
+; SI-NEXT:    v_readlane_b32 s81, v4, 23
+; SI-NEXT:    v_readlane_b32 s80, v4, 22
+; SI-NEXT:    v_readlane_b32 s71, v4, 21
+; SI-NEXT:    v_readlane_b32 s70, v4, 20
+; SI-NEXT:    v_readlane_b32 s69, v4, 19
+; SI-NEXT:    v_readlane_b32 s68, v4, 18
+; SI-NEXT:    v_readlane_b32 s67, v4, 17
+; SI-NEXT:    v_readlane_b32 s66, v4, 16
+; SI-NEXT:    v_readlane_b32 s65, v4, 15
+; SI-NEXT:    v_readlane_b32 s64, v4, 14
+; SI-NEXT:    v_readlane_b32 s55, v4, 13
+; SI-NEXT:    v_readlane_b32 s54, v4, 12
+; SI-NEXT:    v_readlane_b32 s53, v4, 11
+; SI-NEXT:    v_readlane_b32 s52, v4, 10
+; SI-NEXT:    v_readlane_b32 s51, v4, 9
+; SI-NEXT:    v_readlane_b32 s50, v4, 8
+; SI-NEXT:    v_readlane_b32 s49, v4, 7
+; SI-NEXT:    v_readlane_b32 s48, v4, 6
+; SI-NEXT:    v_readlane_b32 s39, v4, 5
+; SI-NEXT:    v_readlane_b32 s38, v4, 4
+; SI-NEXT:    v_readlane_b32 s37, v4, 3
+; SI-NEXT:    v_readlane_b32 s36, v4, 2
+; SI-NEXT:    v_readlane_b32 s35, v4, 1
+; SI-NEXT:    v_readlane_b32 s34, v4, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -66025,30 +66025,30 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v4, s30, 0
-; VI-NEXT:    v_writelane_b32 v4, s31, 1
-; VI-NEXT:    v_writelane_b32 v4, s34, 2
-; VI-NEXT:    v_writelane_b32 v4, s35, 3
-; VI-NEXT:    v_writelane_b32 v4, s36, 4
-; VI-NEXT:    v_writelane_b32 v4, s37, 5
-; VI-NEXT:    v_writelane_b32 v4, s38, 6
-; VI-NEXT:    v_writelane_b32 v4, s39, 7
-; VI-NEXT:    v_writelane_b32 v4, s48, 8
-; VI-NEXT:    v_writelane_b32 v4, s49, 9
-; VI-NEXT:    v_writelane_b32 v4, s50, 10
-; VI-NEXT:    v_writelane_b32 v4, s51, 11
-; VI-NEXT:    v_writelane_b32 v4, s52, 12
-; VI-NEXT:    v_writelane_b32 v4, s53, 13
-; VI-NEXT:    v_writelane_b32 v4, s54, 14
-; VI-NEXT:    v_writelane_b32 v4, s55, 15
-; VI-NEXT:    v_writelane_b32 v4, s64, 16
-; VI-NEXT:    v_writelane_b32 v4, s65, 17
+; VI-NEXT:    v_writelane_b32 v4, s34, 0
+; VI-NEXT:    v_writelane_b32 v4, s35, 1
+; VI-NEXT:    v_writelane_b32 v4, s36, 2
+; VI-NEXT:    v_writelane_b32 v4, s37, 3
+; VI-NEXT:    v_writelane_b32 v4, s38, 4
+; VI-NEXT:    v_writelane_b32 v4, s39, 5
+; VI-NEXT:    v_writelane_b32 v4, s48, 6
+; VI-NEXT:    v_writelane_b32 v4, s49, 7
+; VI-NEXT:    v_writelane_b32 v4, s50, 8
+; VI-NEXT:    v_writelane_b32 v4, s51, 9
+; VI-NEXT:    v_writelane_b32 v4, s52, 10
+; VI-NEXT:    v_writelane_b32 v4, s53, 11
+; VI-NEXT:    v_writelane_b32 v4, s54, 12
+; VI-NEXT:    v_writelane_b32 v4, s55, 13
+; VI-NEXT:    v_writelane_b32 v4, s64, 14
+; VI-NEXT:    v_writelane_b32 v4, s65, 15
+; VI-NEXT:    v_writelane_b32 v4, s66, 16
+; VI-NEXT:    v_writelane_b32 v4, s67, 17
 ; VI-NEXT:    v_readfirstlane_b32 s4, v3
-; VI-NEXT:    v_writelane_b32 v4, s66, 18
+; VI-NEXT:    v_writelane_b32 v4, s30, 18
 ; VI-NEXT:    v_readfirstlane_b32 s5, v2
 ; VI-NEXT:    s_cmp_lg_u32 s4, 0
 ; VI-NEXT:    v_readfirstlane_b32 s4, v1
-; VI-NEXT:    v_writelane_b32 v4, s67, 19
+; VI-NEXT:    v_writelane_b32 v4, s31, 19
 ; VI-NEXT:    s_cbranch_scc0 .LBB97_4
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    s_lshr_b32 s56, s5, 24
@@ -66357,27 +66357,27 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; VI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 60, v0
+; VI-NEXT:    v_readlane_b32 s30, v4, 18
 ; VI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; VI-NEXT:    v_readlane_b32 s67, v4, 19
-; VI-NEXT:    v_readlane_b32 s66, v4, 18
-; VI-NEXT:    v_readlane_b32 s65, v4, 17
-; VI-NEXT:    v_readlane_b32 s64, v4, 16
-; VI-NEXT:    v_readlane_b32 s55, v4, 15
-; VI-NEXT:    v_readlane_b32 s54, v4, 14
-; VI-NEXT:    v_readlane_b32 s53, v4, 13
-; VI-NEXT:    v_readlane_b32 s52, v4, 12
-; VI-NEXT:    v_readlane_b32 s51, v4, 11
-; VI-NEXT:    v_readlane_b32 s50, v4, 10
-; VI-NEXT:    v_readlane_b32 s49, v4, 9
-; VI-NEXT:    v_readlane_b32 s48, v4, 8
-; VI-NEXT:    v_readlane_b32 s39, v4, 7
-; VI-NEXT:    v_readlane_b32 s38, v4, 6
-; VI-NEXT:    v_readlane_b32 s37, v4, 5
-; VI-NEXT:    v_readlane_b32 s36, v4, 4
-; VI-NEXT:    v_readlane_b32 s35, v4, 3
-; VI-NEXT:    v_readlane_b32 s34, v4, 2
-; VI-NEXT:    v_readlane_b32 s31, v4, 1
-; VI-NEXT:    v_readlane_b32 s30, v4, 0
+; VI-NEXT:    v_readlane_b32 s31, v4, 19
+; VI-NEXT:    v_readlane_b32 s67, v4, 17
+; VI-NEXT:    v_readlane_b32 s66, v4, 16
+; VI-NEXT:    v_readlane_b32 s65, v4, 15
+; VI-NEXT:    v_readlane_b32 s64, v4, 14
+; VI-NEXT:    v_readlane_b32 s55, v4, 13
+; VI-NEXT:    v_readlane_b32 s54, v4, 12
+; VI-NEXT:    v_readlane_b32 s53, v4, 11
+; VI-NEXT:    v_readlane_b32 s52, v4, 10
+; VI-NEXT:    v_readlane_b32 s51, v4, 9
+; VI-NEXT:    v_readlane_b32 s50, v4, 8
+; VI-NEXT:    v_readlane_b32 s49, v4, 7
+; VI-NEXT:    v_readlane_b32 s48, v4, 6
+; VI-NEXT:    v_readlane_b32 s39, v4, 5
+; VI-NEXT:    v_readlane_b32 s38, v4, 4
+; VI-NEXT:    v_readlane_b32 s37, v4, 3
+; VI-NEXT:    v_readlane_b32 s36, v4, 2
+; VI-NEXT:    v_readlane_b32 s35, v4, 1
+; VI-NEXT:    v_readlane_b32 s34, v4, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -66440,23 +66440,23 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v63, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v63, s31, 1
-; GFX9-NEXT:    v_writelane_b32 v63, s34, 2
-; GFX9-NEXT:    v_writelane_b32 v63, s35, 3
-; GFX9-NEXT:    v_writelane_b32 v63, s36, 4
-; GFX9-NEXT:    v_writelane_b32 v63, s37, 5
-; GFX9-NEXT:    v_writelane_b32 v63, s38, 6
-; GFX9-NEXT:    v_writelane_b32 v63, s39, 7
-; GFX9-NEXT:    v_writelane_b32 v63, s48, 8
-; GFX9-NEXT:    v_writelane_b32 v63, s49, 9
-; GFX9-NEXT:    v_writelane_b32 v63, s50, 10
-; GFX9-NEXT:    v_writelane_b32 v63, s51, 11
-; GFX9-NEXT:    v_writelane_b32 v63, s52, 12
-; GFX9-NEXT:    v_writelane_b32 v63, s53, 13
-; GFX9-NEXT:    v_writelane_b32 v63, s54, 14
+; GFX9-NEXT:    v_writelane_b32 v63, s34, 0
+; GFX9-NEXT:    v_writelane_b32 v63, s35, 1
+; GFX9-NEXT:    v_writelane_b32 v63, s36, 2
+; GFX9-NEXT:    v_writelane_b32 v63, s37, 3
+; GFX9-NEXT:    v_writelane_b32 v63, s38, 4
+; GFX9-NEXT:    v_writelane_b32 v63, s39, 5
+; GFX9-NEXT:    v_writelane_b32 v63, s48, 6
+; GFX9-NEXT:    v_writelane_b32 v63, s49, 7
+; GFX9-NEXT:    v_writelane_b32 v63, s50, 8
+; GFX9-NEXT:    v_writelane_b32 v63, s51, 9
+; GFX9-NEXT:    v_writelane_b32 v63, s52, 10
+; GFX9-NEXT:    v_writelane_b32 v63, s53, 11
+; GFX9-NEXT:    v_writelane_b32 v63, s54, 12
+; GFX9-NEXT:    v_writelane_b32 v63, s55, 13
+; GFX9-NEXT:    v_writelane_b32 v63, s30, 14
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v3
-; GFX9-NEXT:    v_writelane_b32 v63, s55, 15
+; GFX9-NEXT:    v_writelane_b32 v63, s31, 15
 ; GFX9-NEXT:    v_readfirstlane_b32 s5, v2
 ; GFX9-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
@@ -66786,22 +66786,22 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32
 ; GFX9-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
 ; GFX9-NEXT:    v_perm_b32 v3, v3, v34, s4
 ; GFX9-NEXT:    v_perm_b32 v1, v1, v29, s4
-; GFX9-NEXT:    v_readlane_b32 s55, v63, 15
-; GFX9-NEXT:    v_readlane_b32 s54, v63, 14
-; GFX9-NEXT:    v_readlane_b32 s53, v63, 13
-; GFX9-NEXT:    v_readlane_b32 s52, v63, 12
-; GFX9-NEXT:    v_readlane_b32 s51, v63, 11
-; GFX9-NEXT:    v_readlane_b32 s50, v63, 10
-; GFX9-NEXT:    v_readlane_b32 s49, v63, 9
-; GFX9-NEXT:    v_readlane_b32 s48, v63, 8
-; GFX9-NEXT:    v_readlane_b32 s39, v63, 7
-; GFX9-NEXT:    v_readlane_b32 s38, v63, 6
-; GFX9-NEXT:    v_readlane_b32 s37, v63, 5
-; GFX9-NEXT:    v_readlane_b32 s36, v63, 4
-; GFX9-NEXT:    v_readlane_b32 s35, v63, 3
-; GFX9-NEXT:    v_readlane_b32 s34, v63, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v63, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v63, 0
+; GFX9-NEXT:    v_readlane_b32 s30, v63, 14
+; GFX9-NEXT:    v_readlane_b32 s31, v63, 15
+; GFX9-NEXT:    v_readlane_b32 s55, v63, 13
+; GFX9-NEXT:    v_readlane_b32 s54, v63, 12
+; GFX9-NEXT:    v_readlane_b32 s53, v63, 11
+; GFX9-NEXT:    v_readlane_b32 s52, v63, 10
+; GFX9-NEXT:    v_readlane_b32 s51, v63, 9
+; GFX9-NEXT:    v_readlane_b32 s50, v63, 8
+; GFX9-NEXT:    v_readlane_b32 s49, v63, 7
+; GFX9-NEXT:    v_readlane_b32 s48, v63, 6
+; GFX9-NEXT:    v_readlane_b32 s39, v63, 5
+; GFX9-NEXT:    v_readlane_b32 s38, v63, 4
+; GFX9-NEXT:    v_readlane_b32 s37, v63, 3
+; GFX9-NEXT:    v_readlane_b32 s36, v63, 2
+; GFX9-NEXT:    v_readlane_b32 s35, v63, 1
+; GFX9-NEXT:    v_readlane_b32 s34, v63, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_perm_b32 v5, v33, v5, s4
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
@@ -66852,18 +66852,18 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32
 ; GFX11-NEXT:    s_or_saveexec_b32 s4, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s4
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v40, s34, 0
 ; GFX11-NEXT:    s_cmp_lg_u32 s28, 0
 ; GFX11-NEXT:    s_mov_b32 s42, 0
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX11-NEXT:    v_writelane_b32 v40, s34, 2
-; GFX11-NEXT:    v_writelane_b32 v40, s35, 3
-; GFX11-NEXT:    v_writelane_b32 v40, s36, 4
-; GFX11-NEXT:    v_writelane_b32 v40, s37, 5
-; GFX11-NEXT:    v_writelane_b32 v40, s38, 6
-; GFX11-NEXT:    v_writelane_b32 v40, s39, 7
-; GFX11-NEXT:    v_writelane_b32 v40, s48, 8
-; GFX11-NEXT:    v_writelane_b32 v40, s49, 9
+; GFX11-NEXT:    v_writelane_b32 v40, s35, 1
+; GFX11-NEXT:    v_writelane_b32 v40, s36, 2
+; GFX11-NEXT:    v_writelane_b32 v40, s37, 3
+; GFX11-NEXT:    v_writelane_b32 v40, s38, 4
+; GFX11-NEXT:    v_writelane_b32 v40, s39, 5
+; GFX11-NEXT:    v_writelane_b32 v40, s48, 6
+; GFX11-NEXT:    v_writelane_b32 v40, s49, 7
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 8
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 9
 ; GFX11-NEXT:    s_cbranch_scc0 .LBB97_3
 ; GFX11-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX11-NEXT:    s_lshr_b32 s43, s27, 24
@@ -67131,21 +67131,21 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32
 ; GFX11-NEXT:    v_or_b32_e32 v2, v4, v17
 ; GFX11-NEXT:    v_or_b32_e32 v3, v19, v15
 ; GFX11-NEXT:    v_or_b32_e32 v4, v16, v18
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 8
 ; GFX11-NEXT:    s_clause 0x3
 ; GFX11-NEXT:    scratch_store_b128 v0, v[80:83], off
 ; GFX11-NEXT:    scratch_store_b128 v0, v[11:14], off offset:16
 ; GFX11-NEXT:    scratch_store_b128 v0, v[7:10], off offset:32
 ; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:48
-; GFX11-NEXT:    v_readlane_b32 s49, v40, 9
-; GFX11-NEXT:    v_readlane_b32 s48, v40, 8
-; GFX11-NEXT:    v_readlane_b32 s39, v40, 7
-; GFX11-NEXT:    v_readlane_b32 s38, v40, 6
-; GFX11-NEXT:    v_readlane_b32 s37, v40, 5
-; GFX11-NEXT:    v_readlane_b32 s36, v40, 4
-; GFX11-NEXT:    v_readlane_b32 s35, v40, 3
-; GFX11-NEXT:    v_readlane_b32 s34, v40, 2
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 9
+; GFX11-NEXT:    v_readlane_b32 s49, v40, 7
+; GFX11-NEXT:    v_readlane_b32 s48, v40, 6
+; GFX11-NEXT:    v_readlane_b32 s39, v40, 5
+; GFX11-NEXT:    v_readlane_b32 s38, v40, 4
+; GFX11-NEXT:    v_readlane_b32 s37, v40, 3
+; GFX11-NEXT:    v_readlane_b32 s36, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s35, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s34, v40, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
@@ -67168,7 +67168,7 @@ end:
   ret <64 x i8> %phi
 }
 
-define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
+define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v64i8_to_v32i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -69647,7 +69647,7 @@ end:
   ret <32 x i16> %phi
 }
 
-define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 inreg %b) {
+define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v64i8_to_v32i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -69656,15 +69656,44 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
 ; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_writelane_b32 v40, s30, 0
-; SI-NEXT:    v_writelane_b32 v40, s31, 1
-; SI-NEXT:    v_writelane_b32 v40, s34, 2
-; SI-NEXT:    v_writelane_b32 v40, s35, 3
-; SI-NEXT:    v_writelane_b32 v40, s36, 4
-; SI-NEXT:    v_writelane_b32 v40, s37, 5
+; SI-NEXT:    v_writelane_b32 v40, s34, 0
+; SI-NEXT:    v_writelane_b32 v40, s35, 1
+; SI-NEXT:    v_writelane_b32 v40, s36, 2
+; SI-NEXT:    v_writelane_b32 v40, s37, 3
+; SI-NEXT:    v_writelane_b32 v40, s38, 4
+; SI-NEXT:    v_writelane_b32 v40, s39, 5
+; SI-NEXT:    v_writelane_b32 v40, s48, 6
+; SI-NEXT:    v_writelane_b32 v40, s49, 7
+; SI-NEXT:    v_writelane_b32 v40, s50, 8
+; SI-NEXT:    v_writelane_b32 v40, s51, 9
+; SI-NEXT:    v_writelane_b32 v40, s52, 10
+; SI-NEXT:    v_writelane_b32 v40, s53, 11
+; SI-NEXT:    v_writelane_b32 v40, s54, 12
+; SI-NEXT:    v_writelane_b32 v40, s55, 13
+; SI-NEXT:    v_writelane_b32 v40, s64, 14
+; SI-NEXT:    v_writelane_b32 v40, s65, 15
+; SI-NEXT:    v_writelane_b32 v40, s66, 16
+; SI-NEXT:    v_writelane_b32 v40, s67, 17
+; SI-NEXT:    v_writelane_b32 v40, s68, 18
+; SI-NEXT:    v_writelane_b32 v40, s69, 19
+; SI-NEXT:    v_writelane_b32 v40, s70, 20
+; SI-NEXT:    v_writelane_b32 v40, s71, 21
+; SI-NEXT:    v_writelane_b32 v40, s80, 22
+; SI-NEXT:    v_writelane_b32 v40, s81, 23
+; SI-NEXT:    v_writelane_b32 v40, s82, 24
+; SI-NEXT:    v_writelane_b32 v40, s83, 25
+; SI-NEXT:    v_writelane_b32 v40, s84, 26
+; SI-NEXT:    v_writelane_b32 v40, s85, 27
+; SI-NEXT:    v_writelane_b32 v40, s86, 28
+; SI-NEXT:    v_writelane_b32 v40, s87, 29
+; SI-NEXT:    v_writelane_b32 v40, s96, 30
+; SI-NEXT:    v_writelane_b32 v40, s97, 31
+; SI-NEXT:    v_writelane_b32 v40, s98, 32
+; SI-NEXT:    v_writelane_b32 v40, s99, 33
+; SI-NEXT:    v_writelane_b32 v40, s30, 34
 ; SI-NEXT:    v_readfirstlane_b32 s36, v28
 ; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:76
-; SI-NEXT:    v_writelane_b32 v40, s38, 6
+; SI-NEXT:    v_writelane_b32 v40, s31, 35
 ; SI-NEXT:    v_readfirstlane_b32 s89, v30
 ; SI-NEXT:    v_readfirstlane_b32 s90, v29
 ; SI-NEXT:    v_readfirstlane_b32 s88, v27
@@ -69690,43 +69719,14 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
 ; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:8
 ; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:4
 ; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32
-; SI-NEXT:    v_writelane_b32 v40, s39, 7
-; SI-NEXT:    v_writelane_b32 v40, s48, 8
-; SI-NEXT:    v_writelane_b32 v40, s49, 9
-; SI-NEXT:    v_writelane_b32 v40, s50, 10
-; SI-NEXT:    v_writelane_b32 v40, s51, 11
-; SI-NEXT:    v_writelane_b32 v40, s52, 12
-; SI-NEXT:    v_writelane_b32 v40, s53, 13
-; SI-NEXT:    v_writelane_b32 v40, s54, 14
-; SI-NEXT:    v_writelane_b32 v40, s55, 15
-; SI-NEXT:    v_writelane_b32 v40, s64, 16
-; SI-NEXT:    v_writelane_b32 v40, s65, 17
-; SI-NEXT:    v_writelane_b32 v40, s66, 18
-; SI-NEXT:    v_writelane_b32 v40, s67, 19
-; SI-NEXT:    v_writelane_b32 v40, s68, 20
-; SI-NEXT:    v_writelane_b32 v40, s69, 21
-; SI-NEXT:    v_writelane_b32 v40, s70, 22
-; SI-NEXT:    v_writelane_b32 v40, s71, 23
-; SI-NEXT:    v_writelane_b32 v40, s80, 24
-; SI-NEXT:    v_writelane_b32 v40, s81, 25
-; SI-NEXT:    v_writelane_b32 v40, s82, 26
-; SI-NEXT:    v_writelane_b32 v40, s83, 27
-; SI-NEXT:    v_writelane_b32 v40, s84, 28
 ; SI-NEXT:    ; implicit-def: $vgpr41 : SGPR spill to VGPR lane
-; SI-NEXT:    v_writelane_b32 v40, s85, 29
+; SI-NEXT:    s_mov_b32 s44, s29
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_writelane_b32 v41, s28, 0
-; SI-NEXT:    v_writelane_b32 v40, s86, 30
 ; SI-NEXT:    v_writelane_b32 v41, s23, 1
-; SI-NEXT:    v_writelane_b32 v40, s87, 31
 ; SI-NEXT:    v_writelane_b32 v41, s22, 2
-; SI-NEXT:    v_writelane_b32 v40, s96, 32
 ; SI-NEXT:    v_writelane_b32 v41, s21, 3
-; SI-NEXT:    v_writelane_b32 v40, s97, 33
 ; SI-NEXT:    v_writelane_b32 v41, s20, 4
-; SI-NEXT:    v_writelane_b32 v40, s98, 34
-; SI-NEXT:    v_writelane_b32 v40, s99, 35
-; SI-NEXT:    s_mov_b32 s44, s29
 ; SI-NEXT:    s_mov_b32 s28, s25
 ; SI-NEXT:    s_mov_b32 s29, s24
 ; SI-NEXT:    v_readfirstlane_b32 s94, v23
@@ -70365,6 +70365,7 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
 ; SI-NEXT:    s_and_b32 s5, s5, 0xffff
 ; SI-NEXT:    s_lshl_b32 s20, s45, 16
 ; SI-NEXT:    s_or_b32 s5, s5, s20
+; SI-NEXT:    v_readlane_b32 s30, v40, 34
 ; SI-NEXT:    v_mov_b32_e32 v0, s16
 ; SI-NEXT:    v_mov_b32_e32 v1, s17
 ; SI-NEXT:    v_mov_b32_e32 v2, s18
@@ -70381,42 +70382,41 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
 ; SI-NEXT:    v_mov_b32_e32 v13, s7
 ; SI-NEXT:    v_mov_b32_e32 v14, s4
 ; SI-NEXT:    v_mov_b32_e32 v15, s5
-; SI-NEXT:    v_readlane_b32 s99, v40, 35
-; SI-NEXT:    v_readlane_b32 s98, v40, 34
-; SI-NEXT:    v_readlane_b32 s97, v40, 33
-; SI-NEXT:    v_readlane_b32 s96, v40, 32
-; SI-NEXT:    v_readlane_b32 s87, v40, 31
-; SI-NEXT:    v_readlane_b32 s86, v40, 30
-; SI-NEXT:    v_readlane_b32 s85, v40, 29
-; SI-NEXT:    v_readlane_b32 s84, v40, 28
-; SI-NEXT:    v_readlane_b32 s83, v40, 27
-; SI-NEXT:    v_readlane_b32 s82, v40, 26
-; SI-NEXT:    v_readlane_b32 s81, v40, 25
-; SI-NEXT:    v_readlane_b32 s80, v40, 24
-; SI-NEXT:    v_readlane_b32 s71, v40, 23
-; SI-NEXT:    v_readlane_b32 s70, v40, 22
-; SI-NEXT:    v_readlane_b32 s69, v40, 21
-; SI-NEXT:    v_readlane_b32 s68, v40, 20
-; SI-NEXT:    v_readlane_b32 s67, v40, 19
-; SI-NEXT:    v_readlane_b32 s66, v40, 18
-; SI-NEXT:    v_readlane_b32 s65, v40, 17
-; SI-NEXT:    v_readlane_b32 s64, v40, 16
-; SI-NEXT:    v_readlane_b32 s55, v40, 15
-; SI-NEXT:    v_readlane_b32 s54, v40, 14
-; SI-NEXT:    v_readlane_b32 s53, v40, 13
-; SI-NEXT:    v_readlane_b32 s52, v40, 12
-; SI-NEXT:    v_readlane_b32 s51, v40, 11
-; SI-NEXT:    v_readlane_b32 s50, v40, 10
-; SI-NEXT:    v_readlane_b32 s49, v40, 9
-; SI-NEXT:    v_readlane_b32 s48, v40, 8
-; SI-NEXT:    v_readlane_b32 s39, v40, 7
-; SI-NEXT:    v_readlane_b32 s38, v40, 6
-; SI-NEXT:    v_readlane_b32 s37, v40, 5
-; SI-NEXT:    v_readlane_b32 s36, v40, 4
-; SI-NEXT:    v_readlane_b32 s35, v40, 3
-; SI-NEXT:    v_readlane_b32 s34, v40, 2
-; SI-NEXT:    v_readlane_b32 s31, v40, 1
-; SI-NEXT:    v_readlane_b32 s30, v40, 0
+; SI-NEXT:    v_readlane_b32 s31, v40, 35
+; SI-NEXT:    v_readlane_b32 s99, v40, 33
+; SI-NEXT:    v_readlane_b32 s98, v40, 32
+; SI-NEXT:    v_readlane_b32 s97, v40, 31
+; SI-NEXT:    v_readlane_b32 s96, v40, 30
+; SI-NEXT:    v_readlane_b32 s87, v40, 29
+; SI-NEXT:    v_readlane_b32 s86, v40, 28
+; SI-NEXT:    v_readlane_b32 s85, v40, 27
+; SI-NEXT:    v_readlane_b32 s84, v40, 26
+; SI-NEXT:    v_readlane_b32 s83, v40, 25
+; SI-NEXT:    v_readlane_b32 s82, v40, 24
+; SI-NEXT:    v_readlane_b32 s81, v40, 23
+; SI-NEXT:    v_readlane_b32 s80, v40, 22
+; SI-NEXT:    v_readlane_b32 s71, v40, 21
+; SI-NEXT:    v_readlane_b32 s70, v40, 20
+; SI-NEXT:    v_readlane_b32 s69, v40, 19
+; SI-NEXT:    v_readlane_b32 s68, v40, 18
+; SI-NEXT:    v_readlane_b32 s67, v40, 17
+; SI-NEXT:    v_readlane_b32 s66, v40, 16
+; SI-NEXT:    v_readlane_b32 s65, v40, 15
+; SI-NEXT:    v_readlane_b32 s64, v40, 14
+; SI-NEXT:    v_readlane_b32 s55, v40, 13
+; SI-NEXT:    v_readlane_b32 s54, v40, 12
+; SI-NEXT:    v_readlane_b32 s53, v40, 11
+; SI-NEXT:    v_readlane_b32 s52, v40, 10
+; SI-NEXT:    v_readlane_b32 s51, v40, 9
+; SI-NEXT:    v_readlane_b32 s50, v40, 8
+; SI-NEXT:    v_readlane_b32 s49, v40, 7
+; SI-NEXT:    v_readlane_b32 s48, v40, 6
+; SI-NEXT:    v_readlane_b32 s39, v40, 5
+; SI-NEXT:    v_readlane_b32 s38, v40, 4
+; SI-NEXT:    v_readlane_b32 s37, v40, 3
+; SI-NEXT:    v_readlane_b32 s36, v40, 2
+; SI-NEXT:    v_readlane_b32 s35, v40, 1
+; SI-NEXT:    v_readlane_b32 s34, v40, 0
 ; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
@@ -71635,7 +71635,7 @@ end:
   ret <32 x i16> %phi
 }
 
-define <32 x bfloat> @bitcast_v32f16_to_v32bf16(<32 x half> %a, i32 %b) {
+define <32 x bfloat> @bitcast_v32f16_to_v32bf16(<32 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v32f16_to_v32bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -72128,7 +72128,7 @@ end:
   ret <32 x bfloat> %phi
 }
 
-define inreg <32 x bfloat> @bitcast_v32f16_to_v32bf16_scalar(<32 x half> inreg %a, i32 inreg %b) {
+define inreg <32 x bfloat> @bitcast_v32f16_to_v32bf16_scalar(<32 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v32f16_to_v32bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -72136,14 +72136,14 @@ define inreg <32 x bfloat> @bitcast_v32f16_to_v32bf16_scalar(<32 x half> inreg %
 ; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v40, s30, 0
-; SI-NEXT:    v_writelane_b32 v40, s31, 1
-; SI-NEXT:    v_writelane_b32 v40, s34, 2
-; SI-NEXT:    v_writelane_b32 v40, s35, 3
-; SI-NEXT:    v_writelane_b32 v40, s36, 4
+; SI-NEXT:    v_writelane_b32 v40, s34, 0
+; SI-NEXT:    v_writelane_b32 v40, s35, 1
+; SI-NEXT:    v_writelane_b32 v40, s36, 2
+; SI-NEXT:    v_writelane_b32 v40, s37, 3
+; SI-NEXT:    v_writelane_b32 v40, s38, 4
 ; SI-NEXT:    v_readfirstlane_b32 s60, v1
 ; SI-NEXT:    v_readfirstlane_b32 s56, v0
-; SI-NEXT:    v_writelane_b32 v40, s37, 5
+; SI-NEXT:    v_writelane_b32 v40, s39, 5
 ; SI-NEXT:    s_lshr_b32 s43, s29, 16
 ; SI-NEXT:    s_lshr_b32 s42, s28, 16
 ; SI-NEXT:    s_lshr_b32 s41, s27, 16
@@ -72161,9 +72161,9 @@ define inreg <32 x bfloat> @bitcast_v32f16_to_v32bf16_scalar(<32 x half> inreg %
 ; SI-NEXT:    s_lshr_b32 s63, s60, 16
 ; SI-NEXT:    s_lshr_b32 s59, s56, 16
 ; SI-NEXT:    v_readfirstlane_b32 s4, v2
-; SI-NEXT:    v_writelane_b32 v40, s38, 6
+; SI-NEXT:    v_writelane_b32 v40, s30, 6
 ; SI-NEXT:    s_cmp_lg_u32 s4, 0
-; SI-NEXT:    v_writelane_b32 v40, s39, 7
+; SI-NEXT:    v_writelane_b32 v40, s31, 7
 ; SI-NEXT:    s_cbranch_scc0 .LBB101_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshl_b32 s44, s16, 16
@@ -72460,15 +72460,15 @@ define inreg <32 x bfloat> @bitcast_v32f16_to_v32bf16_scalar(<32 x half> inreg %
 ; SI-NEXT:    v_lshr_b64 v[12:13], v[36:37], 16
 ; SI-NEXT:    v_lshr_b64 v[13:14], v[38:39], 16
 ; SI-NEXT:    v_lshr_b64 v[14:15], v[48:49], 16
+; SI-NEXT:    v_readlane_b32 s30, v40, 6
 ; SI-NEXT:    v_lshr_b64 v[15:16], v[50:51], 16
-; SI-NEXT:    v_readlane_b32 s39, v40, 7
-; SI-NEXT:    v_readlane_b32 s38, v40, 6
-; SI-NEXT:    v_readlane_b32 s37, v40, 5
-; SI-NEXT:    v_readlane_b32 s36, v40, 4
-; SI-NEXT:    v_readlane_b32 s35, v40, 3
-; SI-NEXT:    v_readlane_b32 s34, v40, 2
-; SI-NEXT:    v_readlane_b32 s31, v40, 1
-; SI-NEXT:    v_readlane_b32 s30, v40, 0
+; SI-NEXT:    v_readlane_b32 s31, v40, 7
+; SI-NEXT:    v_readlane_b32 s39, v40, 5
+; SI-NEXT:    v_readlane_b32 s38, v40, 4
+; SI-NEXT:    v_readlane_b32 s37, v40, 3
+; SI-NEXT:    v_readlane_b32 s36, v40, 2
+; SI-NEXT:    v_readlane_b32 s35, v40, 1
+; SI-NEXT:    v_readlane_b32 s34, v40, 0
 ; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -72593,8 +72593,8 @@ define inreg <32 x bfloat> @bitcast_v32f16_to_v32bf16_scalar(<32 x half> inreg %
 ; VI-NEXT:    v_mov_b32_e32 v14, s30
 ; VI-NEXT:    v_mov_b32_e32 v15, s31
 ; VI-NEXT:  .LBB101_5: ; %end
-; VI-NEXT:    v_readlane_b32 s31, v18, 1
 ; VI-NEXT:    v_readlane_b32 s30, v18, 0
+; VI-NEXT:    v_readlane_b32 s31, v18, 1
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -72655,8 +72655,8 @@ define inreg <32 x bfloat> @bitcast_v32f16_to_v32bf16_scalar(<32 x half> inreg %
 ; GFX9-NEXT:    v_mov_b32_e32 v14, s30
 ; GFX9-NEXT:    v_mov_b32_e32 v15, s31
 ; GFX9-NEXT:  .LBB101_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v16, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -72723,7 +72723,7 @@ end:
   ret <32 x bfloat> %phi
 }
 
-define <32 x half> @bitcast_v32bf16_to_v32f16(<32 x bfloat> %a, i32 %b) {
+define <32 x half> @bitcast_v32bf16_to_v32f16(<32 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v32bf16_to_v32f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -74236,7 +74236,7 @@ end:
   ret <32 x half> %phi
 }
 
-define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v32bf16_to_v32f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -75135,8 +75135,8 @@ define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg %
 ; VI-NEXT:    v_mov_b32_e32 v14, s30
 ; VI-NEXT:    v_mov_b32_e32 v15, s31
 ; VI-NEXT:  .LBB103_5: ; %end
-; VI-NEXT:    v_readlane_b32 s31, v20, 1
 ; VI-NEXT:    v_readlane_b32 s30, v20, 0
+; VI-NEXT:    v_readlane_b32 s31, v20, 1
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -75486,8 +75486,8 @@ define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg %
 ; GFX9-NEXT:    v_mov_b32_e32 v14, s30
 ; GFX9-NEXT:    v_mov_b32_e32 v15, s31
 ; GFX9-NEXT:  .LBB103_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s31, v20, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v20, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v20, 1
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -76192,7 +76192,7 @@ end:
   ret <32 x half> %phi
 }
 
-define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) {
+define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v32f16_to_v64i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -78024,7 +78024,7 @@ end:
   ret <64 x i8> %phi
 }
 
-define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 inreg %b) {
+define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v32f16_to_v64i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -78033,44 +78033,42 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32
 ; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_writelane_b32 v18, s30, 0
-; SI-NEXT:    v_writelane_b32 v18, s31, 1
-; SI-NEXT:    v_writelane_b32 v18, s34, 2
-; SI-NEXT:    v_writelane_b32 v18, s35, 3
-; SI-NEXT:    v_writelane_b32 v18, s36, 4
-; SI-NEXT:    v_writelane_b32 v18, s37, 5
-; SI-NEXT:    v_writelane_b32 v18, s38, 6
-; SI-NEXT:    v_writelane_b32 v18, s39, 7
-; SI-NEXT:    v_writelane_b32 v18, s48, 8
-; SI-NEXT:    v_writelane_b32 v18, s49, 9
-; SI-NEXT:    v_writelane_b32 v18, s50, 10
-; SI-NEXT:    v_writelane_b32 v18, s51, 11
-; SI-NEXT:    v_writelane_b32 v18, s52, 12
-; SI-NEXT:    v_writelane_b32 v18, s53, 13
-; SI-NEXT:    v_writelane_b32 v18, s54, 14
-; SI-NEXT:    v_writelane_b32 v18, s55, 15
-; SI-NEXT:    v_writelane_b32 v18, s64, 16
-; SI-NEXT:    v_writelane_b32 v18, s65, 17
-; SI-NEXT:    v_writelane_b32 v18, s66, 18
-; SI-NEXT:    v_writelane_b32 v18, s67, 19
-; SI-NEXT:    v_writelane_b32 v18, s68, 20
-; SI-NEXT:    v_writelane_b32 v18, s69, 21
-; SI-NEXT:    v_writelane_b32 v18, s70, 22
-; SI-NEXT:    v_writelane_b32 v18, s71, 23
-; SI-NEXT:    v_writelane_b32 v18, s80, 24
-; SI-NEXT:    v_writelane_b32 v18, s81, 25
-; SI-NEXT:    v_writelane_b32 v18, s82, 26
-; SI-NEXT:    v_writelane_b32 v18, s83, 27
-; SI-NEXT:    v_writelane_b32 v18, s84, 28
-; SI-NEXT:    v_writelane_b32 v18, s85, 29
-; SI-NEXT:    v_writelane_b32 v18, s86, 30
-; SI-NEXT:    v_writelane_b32 v18, s87, 31
-; SI-NEXT:    v_writelane_b32 v18, s96, 32
-; SI-NEXT:    v_writelane_b32 v18, s97, 33
-; SI-NEXT:    v_writelane_b32 v18, s98, 34
+; SI-NEXT:    v_writelane_b32 v18, s34, 0
+; SI-NEXT:    v_writelane_b32 v18, s35, 1
+; SI-NEXT:    v_writelane_b32 v18, s36, 2
+; SI-NEXT:    v_writelane_b32 v18, s37, 3
+; SI-NEXT:    v_writelane_b32 v18, s38, 4
+; SI-NEXT:    v_writelane_b32 v18, s39, 5
+; SI-NEXT:    v_writelane_b32 v18, s48, 6
+; SI-NEXT:    v_writelane_b32 v18, s49, 7
+; SI-NEXT:    v_writelane_b32 v18, s50, 8
+; SI-NEXT:    v_writelane_b32 v18, s51, 9
+; SI-NEXT:    v_writelane_b32 v18, s52, 10
+; SI-NEXT:    v_writelane_b32 v18, s53, 11
+; SI-NEXT:    v_writelane_b32 v18, s54, 12
+; SI-NEXT:    v_writelane_b32 v18, s55, 13
+; SI-NEXT:    v_writelane_b32 v18, s64, 14
+; SI-NEXT:    v_writelane_b32 v18, s65, 15
+; SI-NEXT:    v_writelane_b32 v18, s66, 16
+; SI-NEXT:    v_writelane_b32 v18, s67, 17
+; SI-NEXT:    v_writelane_b32 v18, s68, 18
+; SI-NEXT:    v_writelane_b32 v18, s69, 19
+; SI-NEXT:    v_writelane_b32 v18, s70, 20
+; SI-NEXT:    v_writelane_b32 v18, s71, 21
+; SI-NEXT:    v_writelane_b32 v18, s80, 22
+; SI-NEXT:    v_writelane_b32 v18, s81, 23
+; SI-NEXT:    v_writelane_b32 v18, s82, 24
+; SI-NEXT:    v_writelane_b32 v18, s83, 25
+; SI-NEXT:    v_writelane_b32 v18, s84, 26
+; SI-NEXT:    v_writelane_b32 v18, s85, 27
+; SI-NEXT:    v_writelane_b32 v18, s86, 28
+; SI-NEXT:    v_writelane_b32 v18, s87, 29
+; SI-NEXT:    v_writelane_b32 v18, s96, 30
+; SI-NEXT:    v_writelane_b32 v18, s97, 31
+; SI-NEXT:    v_writelane_b32 v18, s98, 32
 ; SI-NEXT:    v_readfirstlane_b32 s98, v2
 ; SI-NEXT:    v_readfirstlane_b32 s44, v1
-; SI-NEXT:    v_writelane_b32 v18, s99, 35
+; SI-NEXT:    v_writelane_b32 v18, s99, 33
 ; SI-NEXT:    s_lshr_b32 s96, s29, 16
 ; SI-NEXT:    s_lshr_b32 s97, s28, 16
 ; SI-NEXT:    s_lshr_b32 s86, s27, 16
@@ -78088,7 +78086,9 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32
 ; SI-NEXT:    s_lshr_b32 s99, s98, 16
 ; SI-NEXT:    s_lshr_b32 s46, s44, 16
 ; SI-NEXT:    v_readfirstlane_b32 s4, v3
+; SI-NEXT:    v_writelane_b32 v18, s30, 34
 ; SI-NEXT:    s_cmp_lg_u32 s4, 0
+; SI-NEXT:    v_writelane_b32 v18, s31, 35
 ; SI-NEXT:    ; implicit-def: $vgpr19 : SGPR spill to VGPR lane
 ; SI-NEXT:    s_cbranch_scc0 .LBB105_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
@@ -78672,43 +78672,43 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    v_or_b32_e32 v1, s4, v1
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 60, v0
+; SI-NEXT:    v_readlane_b32 s30, v18, 34
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT:    v_readlane_b32 s99, v18, 35
-; SI-NEXT:    v_readlane_b32 s98, v18, 34
-; SI-NEXT:    v_readlane_b32 s97, v18, 33
-; SI-NEXT:    v_readlane_b32 s96, v18, 32
-; SI-NEXT:    v_readlane_b32 s87, v18, 31
-; SI-NEXT:    v_readlane_b32 s86, v18, 30
-; SI-NEXT:    v_readlane_b32 s85, v18, 29
-; SI-NEXT:    v_readlane_b32 s84, v18, 28
-; SI-NEXT:    v_readlane_b32 s83, v18, 27
-; SI-NEXT:    v_readlane_b32 s82, v18, 26
-; SI-NEXT:    v_readlane_b32 s81, v18, 25
-; SI-NEXT:    v_readlane_b32 s80, v18, 24
-; SI-NEXT:    v_readlane_b32 s71, v18, 23
-; SI-NEXT:    v_readlane_b32 s70, v18, 22
-; SI-NEXT:    v_readlane_b32 s69, v18, 21
-; SI-NEXT:    v_readlane_b32 s68, v18, 20
-; SI-NEXT:    v_readlane_b32 s67, v18, 19
-; SI-NEXT:    v_readlane_b32 s66, v18, 18
-; SI-NEXT:    v_readlane_b32 s65, v18, 17
-; SI-NEXT:    v_readlane_b32 s64, v18, 16
-; SI-NEXT:    v_readlane_b32 s55, v18, 15
-; SI-NEXT:    v_readlane_b32 s54, v18, 14
-; SI-NEXT:    v_readlane_b32 s53, v18, 13
-; SI-NEXT:    v_readlane_b32 s52, v18, 12
-; SI-NEXT:    v_readlane_b32 s51, v18, 11
-; SI-NEXT:    v_readlane_b32 s50, v18, 10
-; SI-NEXT:    v_readlane_b32 s49, v18, 9
-; SI-NEXT:    v_readlane_b32 s48, v18, 8
-; SI-NEXT:    v_readlane_b32 s39, v18, 7
-; SI-NEXT:    v_readlane_b32 s38, v18, 6
-; SI-NEXT:    v_readlane_b32 s37, v18, 5
-; SI-NEXT:    v_readlane_b32 s36, v18, 4
-; SI-NEXT:    v_readlane_b32 s35, v18, 3
-; SI-NEXT:    v_readlane_b32 s34, v18, 2
-; SI-NEXT:    v_readlane_b32 s31, v18, 1
-; SI-NEXT:    v_readlane_b32 s30, v18, 0
+; SI-NEXT:    v_readlane_b32 s31, v18, 35
+; SI-NEXT:    v_readlane_b32 s99, v18, 33
+; SI-NEXT:    v_readlane_b32 s98, v18, 32
+; SI-NEXT:    v_readlane_b32 s97, v18, 31
+; SI-NEXT:    v_readlane_b32 s96, v18, 30
+; SI-NEXT:    v_readlane_b32 s87, v18, 29
+; SI-NEXT:    v_readlane_b32 s86, v18, 28
+; SI-NEXT:    v_readlane_b32 s85, v18, 27
+; SI-NEXT:    v_readlane_b32 s84, v18, 26
+; SI-NEXT:    v_readlane_b32 s83, v18, 25
+; SI-NEXT:    v_readlane_b32 s82, v18, 24
+; SI-NEXT:    v_readlane_b32 s81, v18, 23
+; SI-NEXT:    v_readlane_b32 s80, v18, 22
+; SI-NEXT:    v_readlane_b32 s71, v18, 21
+; SI-NEXT:    v_readlane_b32 s70, v18, 20
+; SI-NEXT:    v_readlane_b32 s69, v18, 19
+; SI-NEXT:    v_readlane_b32 s68, v18, 18
+; SI-NEXT:    v_readlane_b32 s67, v18, 17
+; SI-NEXT:    v_readlane_b32 s66, v18, 16
+; SI-NEXT:    v_readlane_b32 s65, v18, 15
+; SI-NEXT:    v_readlane_b32 s64, v18, 14
+; SI-NEXT:    v_readlane_b32 s55, v18, 13
+; SI-NEXT:    v_readlane_b32 s54, v18, 12
+; SI-NEXT:    v_readlane_b32 s53, v18, 11
+; SI-NEXT:    v_readlane_b32 s52, v18, 10
+; SI-NEXT:    v_readlane_b32 s51, v18, 9
+; SI-NEXT:    v_readlane_b32 s50, v18, 8
+; SI-NEXT:    v_readlane_b32 s49, v18, 7
+; SI-NEXT:    v_readlane_b32 s48, v18, 6
+; SI-NEXT:    v_readlane_b32 s39, v18, 5
+; SI-NEXT:    v_readlane_b32 s38, v18, 4
+; SI-NEXT:    v_readlane_b32 s37, v18, 3
+; SI-NEXT:    v_readlane_b32 s36, v18, 2
+; SI-NEXT:    v_readlane_b32 s35, v18, 1
+; SI-NEXT:    v_readlane_b32 s34, v18, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -78722,27 +78722,27 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32
 ; VI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v63, s30, 0
-; VI-NEXT:    v_writelane_b32 v63, s31, 1
-; VI-NEXT:    v_writelane_b32 v63, s34, 2
-; VI-NEXT:    v_writelane_b32 v63, s35, 3
-; VI-NEXT:    v_writelane_b32 v63, s36, 4
-; VI-NEXT:    v_writelane_b32 v63, s37, 5
-; VI-NEXT:    v_writelane_b32 v63, s38, 6
-; VI-NEXT:    v_writelane_b32 v63, s39, 7
-; VI-NEXT:    v_writelane_b32 v63, s48, 8
-; VI-NEXT:    v_writelane_b32 v63, s49, 9
-; VI-NEXT:    v_writelane_b32 v63, s50, 10
-; VI-NEXT:    v_writelane_b32 v63, s51, 11
-; VI-NEXT:    v_writelane_b32 v63, s52, 12
-; VI-NEXT:    v_writelane_b32 v63, s53, 13
-; VI-NEXT:    v_writelane_b32 v63, s54, 14
-; VI-NEXT:    v_writelane_b32 v63, s55, 15
-; VI-NEXT:    v_writelane_b32 v63, s64, 16
-; VI-NEXT:    v_writelane_b32 v63, s65, 17
-; VI-NEXT:    v_writelane_b32 v63, s66, 18
+; VI-NEXT:    v_writelane_b32 v63, s34, 0
+; VI-NEXT:    v_writelane_b32 v63, s35, 1
+; VI-NEXT:    v_writelane_b32 v63, s36, 2
+; VI-NEXT:    v_writelane_b32 v63, s37, 3
+; VI-NEXT:    v_writelane_b32 v63, s38, 4
+; VI-NEXT:    v_writelane_b32 v63, s39, 5
+; VI-NEXT:    v_writelane_b32 v63, s48, 6
+; VI-NEXT:    v_writelane_b32 v63, s49, 7
+; VI-NEXT:    v_writelane_b32 v63, s50, 8
+; VI-NEXT:    v_writelane_b32 v63, s51, 9
+; VI-NEXT:    v_writelane_b32 v63, s52, 10
+; VI-NEXT:    v_writelane_b32 v63, s53, 11
+; VI-NEXT:    v_writelane_b32 v63, s54, 12
+; VI-NEXT:    v_writelane_b32 v63, s55, 13
+; VI-NEXT:    v_writelane_b32 v63, s64, 14
+; VI-NEXT:    v_writelane_b32 v63, s65, 15
+; VI-NEXT:    v_writelane_b32 v63, s66, 16
+; VI-NEXT:    v_writelane_b32 v63, s67, 17
+; VI-NEXT:    v_writelane_b32 v63, s30, 18
 ; VI-NEXT:    v_readfirstlane_b32 s4, v3
-; VI-NEXT:    v_writelane_b32 v63, s67, 19
+; VI-NEXT:    v_writelane_b32 v63, s31, 19
 ; VI-NEXT:    v_readfirstlane_b32 s5, v2
 ; VI-NEXT:    s_cmp_lg_u32 s4, 0
 ; VI-NEXT:    v_readfirstlane_b32 s4, v1
@@ -79137,26 +79137,26 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; VI-NEXT:    v_readlane_b32 s67, v63, 19
-; VI-NEXT:    v_readlane_b32 s66, v63, 18
-; VI-NEXT:    v_readlane_b32 s65, v63, 17
-; VI-NEXT:    v_readlane_b32 s64, v63, 16
-; VI-NEXT:    v_readlane_b32 s55, v63, 15
-; VI-NEXT:    v_readlane_b32 s54, v63, 14
-; VI-NEXT:    v_readlane_b32 s53, v63, 13
-; VI-NEXT:    v_readlane_b32 s52, v63, 12
-; VI-NEXT:    v_readlane_b32 s51, v63, 11
-; VI-NEXT:    v_readlane_b32 s50, v63, 10
-; VI-NEXT:    v_readlane_b32 s49, v63, 9
-; VI-NEXT:    v_readlane_b32 s48, v63, 8
-; VI-NEXT:    v_readlane_b32 s39, v63, 7
-; VI-NEXT:    v_readlane_b32 s38, v63, 6
-; VI-NEXT:    v_readlane_b32 s37, v63, 5
-; VI-NEXT:    v_readlane_b32 s36, v63, 4
-; VI-NEXT:    v_readlane_b32 s35, v63, 3
-; VI-NEXT:    v_readlane_b32 s34, v63, 2
-; VI-NEXT:    v_readlane_b32 s31, v63, 1
-; VI-NEXT:    v_readlane_b32 s30, v63, 0
+; VI-NEXT:    v_readlane_b32 s30, v63, 18
+; VI-NEXT:    v_readlane_b32 s31, v63, 19
+; VI-NEXT:    v_readlane_b32 s67, v63, 17
+; VI-NEXT:    v_readlane_b32 s66, v63, 16
+; VI-NEXT:    v_readlane_b32 s65, v63, 15
+; VI-NEXT:    v_readlane_b32 s64, v63, 14
+; VI-NEXT:    v_readlane_b32 s55, v63, 13
+; VI-NEXT:    v_readlane_b32 s54, v63, 12
+; VI-NEXT:    v_readlane_b32 s53, v63, 11
+; VI-NEXT:    v_readlane_b32 s52, v63, 10
+; VI-NEXT:    v_readlane_b32 s51, v63, 9
+; VI-NEXT:    v_readlane_b32 s50, v63, 8
+; VI-NEXT:    v_readlane_b32 s49, v63, 7
+; VI-NEXT:    v_readlane_b32 s48, v63, 6
+; VI-NEXT:    v_readlane_b32 s39, v63, 5
+; VI-NEXT:    v_readlane_b32 s38, v63, 4
+; VI-NEXT:    v_readlane_b32 s37, v63, 3
+; VI-NEXT:    v_readlane_b32 s36, v63, 2
+; VI-NEXT:    v_readlane_b32 s35, v63, 1
+; VI-NEXT:    v_readlane_b32 s34, v63, 0
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_perm_b32 v2, v29, v2, s4
 ; VI-NEXT:    v_perm_b32 v1, v50, v1, s4
@@ -79197,23 +79197,23 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v63, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v63, s31, 1
-; GFX9-NEXT:    v_writelane_b32 v63, s34, 2
-; GFX9-NEXT:    v_writelane_b32 v63, s35, 3
-; GFX9-NEXT:    v_writelane_b32 v63, s36, 4
-; GFX9-NEXT:    v_writelane_b32 v63, s37, 5
-; GFX9-NEXT:    v_writelane_b32 v63, s38, 6
-; GFX9-NEXT:    v_writelane_b32 v63, s39, 7
-; GFX9-NEXT:    v_writelane_b32 v63, s48, 8
-; GFX9-NEXT:    v_writelane_b32 v63, s49, 9
-; GFX9-NEXT:    v_writelane_b32 v63, s50, 10
-; GFX9-NEXT:    v_writelane_b32 v63, s51, 11
-; GFX9-NEXT:    v_writelane_b32 v63, s52, 12
-; GFX9-NEXT:    v_writelane_b32 v63, s53, 13
-; GFX9-NEXT:    v_writelane_b32 v63, s54, 14
+; GFX9-NEXT:    v_writelane_b32 v63, s34, 0
+; GFX9-NEXT:    v_writelane_b32 v63, s35, 1
+; GFX9-NEXT:    v_writelane_b32 v63, s36, 2
+; GFX9-NEXT:    v_writelane_b32 v63, s37, 3
+; GFX9-NEXT:    v_writelane_b32 v63, s38, 4
+; GFX9-NEXT:    v_writelane_b32 v63, s39, 5
+; GFX9-NEXT:    v_writelane_b32 v63, s48, 6
+; GFX9-NEXT:    v_writelane_b32 v63, s49, 7
+; GFX9-NEXT:    v_writelane_b32 v63, s50, 8
+; GFX9-NEXT:    v_writelane_b32 v63, s51, 9
+; GFX9-NEXT:    v_writelane_b32 v63, s52, 10
+; GFX9-NEXT:    v_writelane_b32 v63, s53, 11
+; GFX9-NEXT:    v_writelane_b32 v63, s54, 12
+; GFX9-NEXT:    v_writelane_b32 v63, s55, 13
+; GFX9-NEXT:    v_writelane_b32 v63, s30, 14
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v3
-; GFX9-NEXT:    v_writelane_b32 v63, s55, 15
+; GFX9-NEXT:    v_writelane_b32 v63, s31, 15
 ; GFX9-NEXT:    v_readfirstlane_b32 s5, v2
 ; GFX9-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
@@ -79544,22 +79544,22 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32
 ; GFX9-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
 ; GFX9-NEXT:    v_perm_b32 v3, v3, v34, s4
 ; GFX9-NEXT:    v_perm_b32 v1, v1, v29, s4
-; GFX9-NEXT:    v_readlane_b32 s55, v63, 15
-; GFX9-NEXT:    v_readlane_b32 s54, v63, 14
-; GFX9-NEXT:    v_readlane_b32 s53, v63, 13
-; GFX9-NEXT:    v_readlane_b32 s52, v63, 12
-; GFX9-NEXT:    v_readlane_b32 s51, v63, 11
-; GFX9-NEXT:    v_readlane_b32 s50, v63, 10
-; GFX9-NEXT:    v_readlane_b32 s49, v63, 9
-; GFX9-NEXT:    v_readlane_b32 s48, v63, 8
-; GFX9-NEXT:    v_readlane_b32 s39, v63, 7
-; GFX9-NEXT:    v_readlane_b32 s38, v63, 6
-; GFX9-NEXT:    v_readlane_b32 s37, v63, 5
-; GFX9-NEXT:    v_readlane_b32 s36, v63, 4
-; GFX9-NEXT:    v_readlane_b32 s35, v63, 3
-; GFX9-NEXT:    v_readlane_b32 s34, v63, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v63, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v63, 0
+; GFX9-NEXT:    v_readlane_b32 s30, v63, 14
+; GFX9-NEXT:    v_readlane_b32 s31, v63, 15
+; GFX9-NEXT:    v_readlane_b32 s55, v63, 13
+; GFX9-NEXT:    v_readlane_b32 s54, v63, 12
+; GFX9-NEXT:    v_readlane_b32 s53, v63, 11
+; GFX9-NEXT:    v_readlane_b32 s52, v63, 10
+; GFX9-NEXT:    v_readlane_b32 s51, v63, 9
+; GFX9-NEXT:    v_readlane_b32 s50, v63, 8
+; GFX9-NEXT:    v_readlane_b32 s49, v63, 7
+; GFX9-NEXT:    v_readlane_b32 s48, v63, 6
+; GFX9-NEXT:    v_readlane_b32 s39, v63, 5
+; GFX9-NEXT:    v_readlane_b32 s38, v63, 4
+; GFX9-NEXT:    v_readlane_b32 s37, v63, 3
+; GFX9-NEXT:    v_readlane_b32 s36, v63, 2
+; GFX9-NEXT:    v_readlane_b32 s35, v63, 1
+; GFX9-NEXT:    v_readlane_b32 s34, v63, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_perm_b32 v5, v33, v5, s4
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
@@ -79610,18 +79610,18 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32
 ; GFX11-NEXT:    s_or_saveexec_b32 s4, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s4
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v40, s34, 0
 ; GFX11-NEXT:    s_cmp_lg_u32 s28, 0
 ; GFX11-NEXT:    s_mov_b32 s42, 0
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX11-NEXT:    v_writelane_b32 v40, s34, 2
-; GFX11-NEXT:    v_writelane_b32 v40, s35, 3
-; GFX11-NEXT:    v_writelane_b32 v40, s36, 4
-; GFX11-NEXT:    v_writelane_b32 v40, s37, 5
-; GFX11-NEXT:    v_writelane_b32 v40, s38, 6
-; GFX11-NEXT:    v_writelane_b32 v40, s39, 7
-; GFX11-NEXT:    v_writelane_b32 v40, s48, 8
-; GFX11-NEXT:    v_writelane_b32 v40, s49, 9
+; GFX11-NEXT:    v_writelane_b32 v40, s35, 1
+; GFX11-NEXT:    v_writelane_b32 v40, s36, 2
+; GFX11-NEXT:    v_writelane_b32 v40, s37, 3
+; GFX11-NEXT:    v_writelane_b32 v40, s38, 4
+; GFX11-NEXT:    v_writelane_b32 v40, s39, 5
+; GFX11-NEXT:    v_writelane_b32 v40, s48, 6
+; GFX11-NEXT:    v_writelane_b32 v40, s49, 7
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 8
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 9
 ; GFX11-NEXT:    s_cbranch_scc0 .LBB105_3
 ; GFX11-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX11-NEXT:    s_lshr_b32 s43, s27, 24
@@ -79889,21 +79889,21 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32
 ; GFX11-NEXT:    v_or_b32_e32 v2, v4, v17
 ; GFX11-NEXT:    v_or_b32_e32 v3, v19, v15
 ; GFX11-NEXT:    v_or_b32_e32 v4, v16, v18
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 8
 ; GFX11-NEXT:    s_clause 0x3
 ; GFX11-NEXT:    scratch_store_b128 v0, v[80:83], off
 ; GFX11-NEXT:    scratch_store_b128 v0, v[11:14], off offset:16
 ; GFX11-NEXT:    scratch_store_b128 v0, v[7:10], off offset:32
 ; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:48
-; GFX11-NEXT:    v_readlane_b32 s49, v40, 9
-; GFX11-NEXT:    v_readlane_b32 s48, v40, 8
-; GFX11-NEXT:    v_readlane_b32 s39, v40, 7
-; GFX11-NEXT:    v_readlane_b32 s38, v40, 6
-; GFX11-NEXT:    v_readlane_b32 s37, v40, 5
-; GFX11-NEXT:    v_readlane_b32 s36, v40, 4
-; GFX11-NEXT:    v_readlane_b32 s35, v40, 3
-; GFX11-NEXT:    v_readlane_b32 s34, v40, 2
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 9
+; GFX11-NEXT:    v_readlane_b32 s49, v40, 7
+; GFX11-NEXT:    v_readlane_b32 s48, v40, 6
+; GFX11-NEXT:    v_readlane_b32 s39, v40, 5
+; GFX11-NEXT:    v_readlane_b32 s38, v40, 4
+; GFX11-NEXT:    v_readlane_b32 s37, v40, 3
+; GFX11-NEXT:    v_readlane_b32 s36, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s35, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s34, v40, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
@@ -79926,7 +79926,7 @@ end:
   ret <64 x i8> %phi
 }
 
-define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
+define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v64i8_to_v32f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -82405,7 +82405,7 @@ end:
   ret <32 x half> %phi
 }
 
-define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 inreg %b) {
+define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v64i8_to_v32f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -82414,15 +82414,44 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32
 ; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_writelane_b32 v40, s30, 0
-; SI-NEXT:    v_writelane_b32 v40, s31, 1
-; SI-NEXT:    v_writelane_b32 v40, s34, 2
-; SI-NEXT:    v_writelane_b32 v40, s35, 3
-; SI-NEXT:    v_writelane_b32 v40, s36, 4
-; SI-NEXT:    v_writelane_b32 v40, s37, 5
+; SI-NEXT:    v_writelane_b32 v40, s34, 0
+; SI-NEXT:    v_writelane_b32 v40, s35, 1
+; SI-NEXT:    v_writelane_b32 v40, s36, 2
+; SI-NEXT:    v_writelane_b32 v40, s37, 3
+; SI-NEXT:    v_writelane_b32 v40, s38, 4
+; SI-NEXT:    v_writelane_b32 v40, s39, 5
+; SI-NEXT:    v_writelane_b32 v40, s48, 6
+; SI-NEXT:    v_writelane_b32 v40, s49, 7
+; SI-NEXT:    v_writelane_b32 v40, s50, 8
+; SI-NEXT:    v_writelane_b32 v40, s51, 9
+; SI-NEXT:    v_writelane_b32 v40, s52, 10
+; SI-NEXT:    v_writelane_b32 v40, s53, 11
+; SI-NEXT:    v_writelane_b32 v40, s54, 12
+; SI-NEXT:    v_writelane_b32 v40, s55, 13
+; SI-NEXT:    v_writelane_b32 v40, s64, 14
+; SI-NEXT:    v_writelane_b32 v40, s65, 15
+; SI-NEXT:    v_writelane_b32 v40, s66, 16
+; SI-NEXT:    v_writelane_b32 v40, s67, 17
+; SI-NEXT:    v_writelane_b32 v40, s68, 18
+; SI-NEXT:    v_writelane_b32 v40, s69, 19
+; SI-NEXT:    v_writelane_b32 v40, s70, 20
+; SI-NEXT:    v_writelane_b32 v40, s71, 21
+; SI-NEXT:    v_writelane_b32 v40, s80, 22
+; SI-NEXT:    v_writelane_b32 v40, s81, 23
+; SI-NEXT:    v_writelane_b32 v40, s82, 24
+; SI-NEXT:    v_writelane_b32 v40, s83, 25
+; SI-NEXT:    v_writelane_b32 v40, s84, 26
+; SI-NEXT:    v_writelane_b32 v40, s85, 27
+; SI-NEXT:    v_writelane_b32 v40, s86, 28
+; SI-NEXT:    v_writelane_b32 v40, s87, 29
+; SI-NEXT:    v_writelane_b32 v40, s96, 30
+; SI-NEXT:    v_writelane_b32 v40, s97, 31
+; SI-NEXT:    v_writelane_b32 v40, s98, 32
+; SI-NEXT:    v_writelane_b32 v40, s99, 33
+; SI-NEXT:    v_writelane_b32 v40, s30, 34
 ; SI-NEXT:    v_readfirstlane_b32 s36, v28
 ; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:76
-; SI-NEXT:    v_writelane_b32 v40, s38, 6
+; SI-NEXT:    v_writelane_b32 v40, s31, 35
 ; SI-NEXT:    v_readfirstlane_b32 s89, v30
 ; SI-NEXT:    v_readfirstlane_b32 s90, v29
 ; SI-NEXT:    v_readfirstlane_b32 s88, v27
@@ -82448,43 +82477,14 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32
 ; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:8
 ; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:4
 ; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32
-; SI-NEXT:    v_writelane_b32 v40, s39, 7
-; SI-NEXT:    v_writelane_b32 v40, s48, 8
-; SI-NEXT:    v_writelane_b32 v40, s49, 9
-; SI-NEXT:    v_writelane_b32 v40, s50, 10
-; SI-NEXT:    v_writelane_b32 v40, s51, 11
-; SI-NEXT:    v_writelane_b32 v40, s52, 12
-; SI-NEXT:    v_writelane_b32 v40, s53, 13
-; SI-NEXT:    v_writelane_b32 v40, s54, 14
-; SI-NEXT:    v_writelane_b32 v40, s55, 15
-; SI-NEXT:    v_writelane_b32 v40, s64, 16
-; SI-NEXT:    v_writelane_b32 v40, s65, 17
-; SI-NEXT:    v_writelane_b32 v40, s66, 18
-; SI-NEXT:    v_writelane_b32 v40, s67, 19
-; SI-NEXT:    v_writelane_b32 v40, s68, 20
-; SI-NEXT:    v_writelane_b32 v40, s69, 21
-; SI-NEXT:    v_writelane_b32 v40, s70, 22
-; SI-NEXT:    v_writelane_b32 v40, s71, 23
-; SI-NEXT:    v_writelane_b32 v40, s80, 24
-; SI-NEXT:    v_writelane_b32 v40, s81, 25
-; SI-NEXT:    v_writelane_b32 v40, s82, 26
-; SI-NEXT:    v_writelane_b32 v40, s83, 27
-; SI-NEXT:    v_writelane_b32 v40, s84, 28
 ; SI-NEXT:    ; implicit-def: $vgpr41 : SGPR spill to VGPR lane
-; SI-NEXT:    v_writelane_b32 v40, s85, 29
+; SI-NEXT:    s_mov_b32 s44, s29
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_writelane_b32 v41, s28, 0
-; SI-NEXT:    v_writelane_b32 v40, s86, 30
 ; SI-NEXT:    v_writelane_b32 v41, s23, 1
-; SI-NEXT:    v_writelane_b32 v40, s87, 31
 ; SI-NEXT:    v_writelane_b32 v41, s22, 2
-; SI-NEXT:    v_writelane_b32 v40, s96, 32
 ; SI-NEXT:    v_writelane_b32 v41, s21, 3
-; SI-NEXT:    v_writelane_b32 v40, s97, 33
 ; SI-NEXT:    v_writelane_b32 v41, s20, 4
-; SI-NEXT:    v_writelane_b32 v40, s98, 34
-; SI-NEXT:    v_writelane_b32 v40, s99, 35
-; SI-NEXT:    s_mov_b32 s44, s29
 ; SI-NEXT:    s_mov_b32 s28, s25
 ; SI-NEXT:    s_mov_b32 s29, s24
 ; SI-NEXT:    v_readfirstlane_b32 s94, v23
@@ -83123,6 +83123,7 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32
 ; SI-NEXT:    s_and_b32 s5, s5, 0xffff
 ; SI-NEXT:    s_lshl_b32 s20, s45, 16
 ; SI-NEXT:    s_or_b32 s5, s5, s20
+; SI-NEXT:    v_readlane_b32 s30, v40, 34
 ; SI-NEXT:    v_mov_b32_e32 v0, s16
 ; SI-NEXT:    v_mov_b32_e32 v1, s17
 ; SI-NEXT:    v_mov_b32_e32 v2, s18
@@ -83139,42 +83140,41 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32
 ; SI-NEXT:    v_mov_b32_e32 v13, s7
 ; SI-NEXT:    v_mov_b32_e32 v14, s4
 ; SI-NEXT:    v_mov_b32_e32 v15, s5
-; SI-NEXT:    v_readlane_b32 s99, v40, 35
-; SI-NEXT:    v_readlane_b32 s98, v40, 34
-; SI-NEXT:    v_readlane_b32 s97, v40, 33
-; SI-NEXT:    v_readlane_b32 s96, v40, 32
-; SI-NEXT:    v_readlane_b32 s87, v40, 31
-; SI-NEXT:    v_readlane_b32 s86, v40, 30
-; SI-NEXT:    v_readlane_b32 s85, v40, 29
-; SI-NEXT:    v_readlane_b32 s84, v40, 28
-; SI-NEXT:    v_readlane_b32 s83, v40, 27
-; SI-NEXT:    v_readlane_b32 s82, v40, 26
-; SI-NEXT:    v_readlane_b32 s81, v40, 25
-; SI-NEXT:    v_readlane_b32 s80, v40, 24
-; SI-NEXT:    v_readlane_b32 s71, v40, 23
-; SI-NEXT:    v_readlane_b32 s70, v40, 22
-; SI-NEXT:    v_readlane_b32 s69, v40, 21
-; SI-NEXT:    v_readlane_b32 s68, v40, 20
-; SI-NEXT:    v_readlane_b32 s67, v40, 19
-; SI-NEXT:    v_readlane_b32 s66, v40, 18
-; SI-NEXT:    v_readlane_b32 s65, v40, 17
-; SI-NEXT:    v_readlane_b32 s64, v40, 16
-; SI-NEXT:    v_readlane_b32 s55, v40, 15
-; SI-NEXT:    v_readlane_b32 s54, v40, 14
-; SI-NEXT:    v_readlane_b32 s53, v40, 13
-; SI-NEXT:    v_readlane_b32 s52, v40, 12
-; SI-NEXT:    v_readlane_b32 s51, v40, 11
-; SI-NEXT:    v_readlane_b32 s50, v40, 10
-; SI-NEXT:    v_readlane_b32 s49, v40, 9
-; SI-NEXT:    v_readlane_b32 s48, v40, 8
-; SI-NEXT:    v_readlane_b32 s39, v40, 7
-; SI-NEXT:    v_readlane_b32 s38, v40, 6
-; SI-NEXT:    v_readlane_b32 s37, v40, 5
-; SI-NEXT:    v_readlane_b32 s36, v40, 4
-; SI-NEXT:    v_readlane_b32 s35, v40, 3
-; SI-NEXT:    v_readlane_b32 s34, v40, 2
-; SI-NEXT:    v_readlane_b32 s31, v40, 1
-; SI-NEXT:    v_readlane_b32 s30, v40, 0
+; SI-NEXT:    v_readlane_b32 s31, v40, 35
+; SI-NEXT:    v_readlane_b32 s99, v40, 33
+; SI-NEXT:    v_readlane_b32 s98, v40, 32
+; SI-NEXT:    v_readlane_b32 s97, v40, 31
+; SI-NEXT:    v_readlane_b32 s96, v40, 30
+; SI-NEXT:    v_readlane_b32 s87, v40, 29
+; SI-NEXT:    v_readlane_b32 s86, v40, 28
+; SI-NEXT:    v_readlane_b32 s85, v40, 27
+; SI-NEXT:    v_readlane_b32 s84, v40, 26
+; SI-NEXT:    v_readlane_b32 s83, v40, 25
+; SI-NEXT:    v_readlane_b32 s82, v40, 24
+; SI-NEXT:    v_readlane_b32 s81, v40, 23
+; SI-NEXT:    v_readlane_b32 s80, v40, 22
+; SI-NEXT:    v_readlane_b32 s71, v40, 21
+; SI-NEXT:    v_readlane_b32 s70, v40, 20
+; SI-NEXT:    v_readlane_b32 s69, v40, 19
+; SI-NEXT:    v_readlane_b32 s68, v40, 18
+; SI-NEXT:    v_readlane_b32 s67, v40, 17
+; SI-NEXT:    v_readlane_b32 s66, v40, 16
+; SI-NEXT:    v_readlane_b32 s65, v40, 15
+; SI-NEXT:    v_readlane_b32 s64, v40, 14
+; SI-NEXT:    v_readlane_b32 s55, v40, 13
+; SI-NEXT:    v_readlane_b32 s54, v40, 12
+; SI-NEXT:    v_readlane_b32 s53, v40, 11
+; SI-NEXT:    v_readlane_b32 s52, v40, 10
+; SI-NEXT:    v_readlane_b32 s51, v40, 9
+; SI-NEXT:    v_readlane_b32 s50, v40, 8
+; SI-NEXT:    v_readlane_b32 s49, v40, 7
+; SI-NEXT:    v_readlane_b32 s48, v40, 6
+; SI-NEXT:    v_readlane_b32 s39, v40, 5
+; SI-NEXT:    v_readlane_b32 s38, v40, 4
+; SI-NEXT:    v_readlane_b32 s37, v40, 3
+; SI-NEXT:    v_readlane_b32 s36, v40, 2
+; SI-NEXT:    v_readlane_b32 s35, v40, 1
+; SI-NEXT:    v_readlane_b32 s34, v40, 0
 ; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
@@ -84393,7 +84393,7 @@ end:
   ret <32 x half> %phi
 }
 
-define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
+define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v32bf16_to_v64i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -87369,7 +87369,7 @@ end:
   ret <64 x i8> %phi
 }
 
-define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v32bf16_to_v64i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -87378,42 +87378,42 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
 ; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_writelane_b32 v40, s30, 0
-; SI-NEXT:    v_writelane_b32 v40, s31, 1
-; SI-NEXT:    v_writelane_b32 v40, s34, 2
-; SI-NEXT:    v_writelane_b32 v40, s35, 3
-; SI-NEXT:    v_writelane_b32 v40, s36, 4
-; SI-NEXT:    v_writelane_b32 v40, s37, 5
-; SI-NEXT:    v_writelane_b32 v40, s38, 6
-; SI-NEXT:    v_writelane_b32 v40, s39, 7
-; SI-NEXT:    v_writelane_b32 v40, s48, 8
-; SI-NEXT:    v_writelane_b32 v40, s49, 9
-; SI-NEXT:    v_writelane_b32 v40, s50, 10
-; SI-NEXT:    v_writelane_b32 v40, s51, 11
-; SI-NEXT:    v_writelane_b32 v40, s52, 12
-; SI-NEXT:    v_writelane_b32 v40, s53, 13
-; SI-NEXT:    v_writelane_b32 v40, s54, 14
-; SI-NEXT:    v_writelane_b32 v40, s55, 15
-; SI-NEXT:    v_writelane_b32 v40, s64, 16
-; SI-NEXT:    v_writelane_b32 v40, s65, 17
-; SI-NEXT:    v_writelane_b32 v40, s66, 18
-; SI-NEXT:    v_writelane_b32 v40, s67, 19
-; SI-NEXT:    v_writelane_b32 v40, s68, 20
-; SI-NEXT:    v_writelane_b32 v40, s69, 21
-; SI-NEXT:    v_writelane_b32 v40, s70, 22
-; SI-NEXT:    v_writelane_b32 v40, s71, 23
-; SI-NEXT:    v_writelane_b32 v40, s80, 24
-; SI-NEXT:    v_writelane_b32 v40, s81, 25
-; SI-NEXT:    v_writelane_b32 v40, s82, 26
-; SI-NEXT:    v_writelane_b32 v40, s83, 27
-; SI-NEXT:    v_writelane_b32 v40, s84, 28
-; SI-NEXT:    v_writelane_b32 v40, s85, 29
-; SI-NEXT:    v_writelane_b32 v40, s86, 30
-; SI-NEXT:    v_writelane_b32 v40, s87, 31
-; SI-NEXT:    v_writelane_b32 v40, s96, 32
+; SI-NEXT:    v_writelane_b32 v40, s34, 0
+; SI-NEXT:    v_writelane_b32 v40, s35, 1
+; SI-NEXT:    v_writelane_b32 v40, s36, 2
+; SI-NEXT:    v_writelane_b32 v40, s37, 3
+; SI-NEXT:    v_writelane_b32 v40, s38, 4
+; SI-NEXT:    v_writelane_b32 v40, s39, 5
+; SI-NEXT:    v_writelane_b32 v40, s48, 6
+; SI-NEXT:    v_writelane_b32 v40, s49, 7
+; SI-NEXT:    v_writelane_b32 v40, s50, 8
+; SI-NEXT:    v_writelane_b32 v40, s51, 9
+; SI-NEXT:    v_writelane_b32 v40, s52, 10
+; SI-NEXT:    v_writelane_b32 v40, s53, 11
+; SI-NEXT:    v_writelane_b32 v40, s54, 12
+; SI-NEXT:    v_writelane_b32 v40, s55, 13
+; SI-NEXT:    v_writelane_b32 v40, s64, 14
+; SI-NEXT:    v_writelane_b32 v40, s65, 15
+; SI-NEXT:    v_writelane_b32 v40, s66, 16
+; SI-NEXT:    v_writelane_b32 v40, s67, 17
+; SI-NEXT:    v_writelane_b32 v40, s68, 18
+; SI-NEXT:    v_writelane_b32 v40, s69, 19
+; SI-NEXT:    v_writelane_b32 v40, s70, 20
+; SI-NEXT:    v_writelane_b32 v40, s71, 21
+; SI-NEXT:    v_writelane_b32 v40, s80, 22
+; SI-NEXT:    v_writelane_b32 v40, s81, 23
+; SI-NEXT:    v_writelane_b32 v40, s82, 24
+; SI-NEXT:    v_writelane_b32 v40, s83, 25
+; SI-NEXT:    v_writelane_b32 v40, s84, 26
+; SI-NEXT:    v_writelane_b32 v40, s85, 27
+; SI-NEXT:    v_writelane_b32 v40, s86, 28
+; SI-NEXT:    v_writelane_b32 v40, s87, 29
+; SI-NEXT:    v_writelane_b32 v40, s96, 30
+; SI-NEXT:    v_writelane_b32 v40, s97, 31
+; SI-NEXT:    v_writelane_b32 v40, s98, 32
 ; SI-NEXT:    v_readfirstlane_b32 s42, v2
 ; SI-NEXT:    v_readfirstlane_b32 s44, v1
-; SI-NEXT:    v_writelane_b32 v40, s97, 33
+; SI-NEXT:    v_writelane_b32 v40, s99, 33
 ; SI-NEXT:    s_and_b32 s4, s29, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s5, s29, 16
 ; SI-NEXT:    s_and_b32 s6, s28, 0xffff0000
@@ -87447,7 +87447,7 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
 ; SI-NEXT:    s_and_b32 s45, s44, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s44, s44, 16
 ; SI-NEXT:    v_readfirstlane_b32 s46, v3
-; SI-NEXT:    v_writelane_b32 v40, s98, 34
+; SI-NEXT:    v_writelane_b32 v40, s30, 34
 ; SI-NEXT:    s_cmp_lg_u32 s46, 0
 ; SI-NEXT:    v_mul_f32_e64 v3, 1.0, s41
 ; SI-NEXT:    v_mul_f32_e64 v4, 1.0, s16
@@ -87481,7 +87481,7 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
 ; SI-NEXT:    v_mul_f32_e64 v37, 1.0, s44
 ; SI-NEXT:    v_mul_f32_e64 v33, 1.0, s43
 ; SI-NEXT:    v_mul_f32_e64 v35, 1.0, s42
-; SI-NEXT:    v_writelane_b32 v40, s99, 35
+; SI-NEXT:    v_writelane_b32 v40, s31, 35
 ; SI-NEXT:    ; implicit-def: $vgpr41 : SGPR spill to VGPR lane
 ; SI-NEXT:    s_cbranch_scc0 .LBB109_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
@@ -87978,44 +87978,44 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
 ; SI-NEXT:    v_or_b32_e32 v1, s5, v1
 ; SI-NEXT:    v_or_b32_e32 v1, s4, v1
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 60, v0
+; SI-NEXT:    v_readlane_b32 s30, v40, 34
 ; SI-NEXT:    v_readlane_b32 s75, v41, 1
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT:    v_readlane_b32 s99, v40, 35
-; SI-NEXT:    v_readlane_b32 s98, v40, 34
-; SI-NEXT:    v_readlane_b32 s97, v40, 33
-; SI-NEXT:    v_readlane_b32 s96, v40, 32
-; SI-NEXT:    v_readlane_b32 s87, v40, 31
-; SI-NEXT:    v_readlane_b32 s86, v40, 30
-; SI-NEXT:    v_readlane_b32 s85, v40, 29
-; SI-NEXT:    v_readlane_b32 s84, v40, 28
-; SI-NEXT:    v_readlane_b32 s83, v40, 27
-; SI-NEXT:    v_readlane_b32 s82, v40, 26
-; SI-NEXT:    v_readlane_b32 s81, v40, 25
-; SI-NEXT:    v_readlane_b32 s80, v40, 24
-; SI-NEXT:    v_readlane_b32 s71, v40, 23
-; SI-NEXT:    v_readlane_b32 s70, v40, 22
-; SI-NEXT:    v_readlane_b32 s69, v40, 21
-; SI-NEXT:    v_readlane_b32 s68, v40, 20
-; SI-NEXT:    v_readlane_b32 s67, v40, 19
-; SI-NEXT:    v_readlane_b32 s66, v40, 18
-; SI-NEXT:    v_readlane_b32 s65, v40, 17
-; SI-NEXT:    v_readlane_b32 s64, v40, 16
-; SI-NEXT:    v_readlane_b32 s55, v40, 15
-; SI-NEXT:    v_readlane_b32 s54, v40, 14
-; SI-NEXT:    v_readlane_b32 s53, v40, 13
-; SI-NEXT:    v_readlane_b32 s52, v40, 12
-; SI-NEXT:    v_readlane_b32 s51, v40, 11
-; SI-NEXT:    v_readlane_b32 s50, v40, 10
-; SI-NEXT:    v_readlane_b32 s49, v40, 9
-; SI-NEXT:    v_readlane_b32 s48, v40, 8
-; SI-NEXT:    v_readlane_b32 s39, v40, 7
-; SI-NEXT:    v_readlane_b32 s38, v40, 6
-; SI-NEXT:    v_readlane_b32 s37, v40, 5
-; SI-NEXT:    v_readlane_b32 s36, v40, 4
-; SI-NEXT:    v_readlane_b32 s35, v40, 3
-; SI-NEXT:    v_readlane_b32 s34, v40, 2
-; SI-NEXT:    v_readlane_b32 s31, v40, 1
-; SI-NEXT:    v_readlane_b32 s30, v40, 0
+; SI-NEXT:    v_readlane_b32 s31, v40, 35
+; SI-NEXT:    v_readlane_b32 s99, v40, 33
+; SI-NEXT:    v_readlane_b32 s98, v40, 32
+; SI-NEXT:    v_readlane_b32 s97, v40, 31
+; SI-NEXT:    v_readlane_b32 s96, v40, 30
+; SI-NEXT:    v_readlane_b32 s87, v40, 29
+; SI-NEXT:    v_readlane_b32 s86, v40, 28
+; SI-NEXT:    v_readlane_b32 s85, v40, 27
+; SI-NEXT:    v_readlane_b32 s84, v40, 26
+; SI-NEXT:    v_readlane_b32 s83, v40, 25
+; SI-NEXT:    v_readlane_b32 s82, v40, 24
+; SI-NEXT:    v_readlane_b32 s81, v40, 23
+; SI-NEXT:    v_readlane_b32 s80, v40, 22
+; SI-NEXT:    v_readlane_b32 s71, v40, 21
+; SI-NEXT:    v_readlane_b32 s70, v40, 20
+; SI-NEXT:    v_readlane_b32 s69, v40, 19
+; SI-NEXT:    v_readlane_b32 s68, v40, 18
+; SI-NEXT:    v_readlane_b32 s67, v40, 17
+; SI-NEXT:    v_readlane_b32 s66, v40, 16
+; SI-NEXT:    v_readlane_b32 s65, v40, 15
+; SI-NEXT:    v_readlane_b32 s64, v40, 14
+; SI-NEXT:    v_readlane_b32 s55, v40, 13
+; SI-NEXT:    v_readlane_b32 s54, v40, 12
+; SI-NEXT:    v_readlane_b32 s53, v40, 11
+; SI-NEXT:    v_readlane_b32 s52, v40, 10
+; SI-NEXT:    v_readlane_b32 s51, v40, 9
+; SI-NEXT:    v_readlane_b32 s50, v40, 8
+; SI-NEXT:    v_readlane_b32 s49, v40, 7
+; SI-NEXT:    v_readlane_b32 s48, v40, 6
+; SI-NEXT:    v_readlane_b32 s39, v40, 5
+; SI-NEXT:    v_readlane_b32 s38, v40, 4
+; SI-NEXT:    v_readlane_b32 s37, v40, 3
+; SI-NEXT:    v_readlane_b32 s36, v40, 2
+; SI-NEXT:    v_readlane_b32 s35, v40, 1
+; SI-NEXT:    v_readlane_b32 s34, v40, 0
 ; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -88098,27 +88098,27 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
 ; VI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v63, s30, 0
-; VI-NEXT:    v_writelane_b32 v63, s31, 1
-; VI-NEXT:    v_writelane_b32 v63, s34, 2
-; VI-NEXT:    v_writelane_b32 v63, s35, 3
-; VI-NEXT:    v_writelane_b32 v63, s36, 4
-; VI-NEXT:    v_writelane_b32 v63, s37, 5
-; VI-NEXT:    v_writelane_b32 v63, s38, 6
-; VI-NEXT:    v_writelane_b32 v63, s39, 7
-; VI-NEXT:    v_writelane_b32 v63, s48, 8
-; VI-NEXT:    v_writelane_b32 v63, s49, 9
-; VI-NEXT:    v_writelane_b32 v63, s50, 10
-; VI-NEXT:    v_writelane_b32 v63, s51, 11
-; VI-NEXT:    v_writelane_b32 v63, s52, 12
-; VI-NEXT:    v_writelane_b32 v63, s53, 13
-; VI-NEXT:    v_writelane_b32 v63, s54, 14
-; VI-NEXT:    v_writelane_b32 v63, s55, 15
-; VI-NEXT:    v_writelane_b32 v63, s64, 16
-; VI-NEXT:    v_writelane_b32 v63, s65, 17
-; VI-NEXT:    v_writelane_b32 v63, s66, 18
+; VI-NEXT:    v_writelane_b32 v63, s34, 0
+; VI-NEXT:    v_writelane_b32 v63, s35, 1
+; VI-NEXT:    v_writelane_b32 v63, s36, 2
+; VI-NEXT:    v_writelane_b32 v63, s37, 3
+; VI-NEXT:    v_writelane_b32 v63, s38, 4
+; VI-NEXT:    v_writelane_b32 v63, s39, 5
+; VI-NEXT:    v_writelane_b32 v63, s48, 6
+; VI-NEXT:    v_writelane_b32 v63, s49, 7
+; VI-NEXT:    v_writelane_b32 v63, s50, 8
+; VI-NEXT:    v_writelane_b32 v63, s51, 9
+; VI-NEXT:    v_writelane_b32 v63, s52, 10
+; VI-NEXT:    v_writelane_b32 v63, s53, 11
+; VI-NEXT:    v_writelane_b32 v63, s54, 12
+; VI-NEXT:    v_writelane_b32 v63, s55, 13
+; VI-NEXT:    v_writelane_b32 v63, s64, 14
+; VI-NEXT:    v_writelane_b32 v63, s65, 15
+; VI-NEXT:    v_writelane_b32 v63, s66, 16
+; VI-NEXT:    v_writelane_b32 v63, s67, 17
+; VI-NEXT:    v_writelane_b32 v63, s30, 18
 ; VI-NEXT:    v_readfirstlane_b32 s4, v3
-; VI-NEXT:    v_writelane_b32 v63, s67, 19
+; VI-NEXT:    v_writelane_b32 v63, s31, 19
 ; VI-NEXT:    v_readfirstlane_b32 s5, v2
 ; VI-NEXT:    s_cmp_lg_u32 s4, 0
 ; VI-NEXT:    v_readfirstlane_b32 s4, v1
@@ -88749,26 +88749,26 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
 ; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
 ; VI-NEXT:    v_perm_b32 v1, v22, v9, s4
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 60, v0
-; VI-NEXT:    v_readlane_b32 s67, v63, 19
-; VI-NEXT:    v_readlane_b32 s66, v63, 18
-; VI-NEXT:    v_readlane_b32 s65, v63, 17
-; VI-NEXT:    v_readlane_b32 s64, v63, 16
-; VI-NEXT:    v_readlane_b32 s55, v63, 15
-; VI-NEXT:    v_readlane_b32 s54, v63, 14
-; VI-NEXT:    v_readlane_b32 s53, v63, 13
-; VI-NEXT:    v_readlane_b32 s52, v63, 12
-; VI-NEXT:    v_readlane_b32 s51, v63, 11
-; VI-NEXT:    v_readlane_b32 s50, v63, 10
-; VI-NEXT:    v_readlane_b32 s49, v63, 9
-; VI-NEXT:    v_readlane_b32 s48, v63, 8
-; VI-NEXT:    v_readlane_b32 s39, v63, 7
-; VI-NEXT:    v_readlane_b32 s38, v63, 6
-; VI-NEXT:    v_readlane_b32 s37, v63, 5
-; VI-NEXT:    v_readlane_b32 s36, v63, 4
-; VI-NEXT:    v_readlane_b32 s35, v63, 3
-; VI-NEXT:    v_readlane_b32 s34, v63, 2
-; VI-NEXT:    v_readlane_b32 s31, v63, 1
-; VI-NEXT:    v_readlane_b32 s30, v63, 0
+; VI-NEXT:    v_readlane_b32 s30, v63, 18
+; VI-NEXT:    v_readlane_b32 s31, v63, 19
+; VI-NEXT:    v_readlane_b32 s67, v63, 17
+; VI-NEXT:    v_readlane_b32 s66, v63, 16
+; VI-NEXT:    v_readlane_b32 s65, v63, 15
+; VI-NEXT:    v_readlane_b32 s64, v63, 14
+; VI-NEXT:    v_readlane_b32 s55, v63, 13
+; VI-NEXT:    v_readlane_b32 s54, v63, 12
+; VI-NEXT:    v_readlane_b32 s53, v63, 11
+; VI-NEXT:    v_readlane_b32 s52, v63, 10
+; VI-NEXT:    v_readlane_b32 s51, v63, 9
+; VI-NEXT:    v_readlane_b32 s50, v63, 8
+; VI-NEXT:    v_readlane_b32 s49, v63, 7
+; VI-NEXT:    v_readlane_b32 s48, v63, 6
+; VI-NEXT:    v_readlane_b32 s39, v63, 5
+; VI-NEXT:    v_readlane_b32 s38, v63, 4
+; VI-NEXT:    v_readlane_b32 s37, v63, 3
+; VI-NEXT:    v_readlane_b32 s36, v63, 2
+; VI-NEXT:    v_readlane_b32 s35, v63, 1
+; VI-NEXT:    v_readlane_b32 s34, v63, 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_perm_b32 v2, v3, v2, s4
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
@@ -88801,23 +88801,23 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
 ; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v63, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v63, s31, 1
-; GFX9-NEXT:    v_writelane_b32 v63, s34, 2
-; GFX9-NEXT:    v_writelane_b32 v63, s35, 3
-; GFX9-NEXT:    v_writelane_b32 v63, s36, 4
-; GFX9-NEXT:    v_writelane_b32 v63, s37, 5
-; GFX9-NEXT:    v_writelane_b32 v63, s38, 6
-; GFX9-NEXT:    v_writelane_b32 v63, s39, 7
-; GFX9-NEXT:    v_writelane_b32 v63, s48, 8
-; GFX9-NEXT:    v_writelane_b32 v63, s49, 9
-; GFX9-NEXT:    v_writelane_b32 v63, s50, 10
-; GFX9-NEXT:    v_writelane_b32 v63, s51, 11
-; GFX9-NEXT:    v_writelane_b32 v63, s52, 12
-; GFX9-NEXT:    v_writelane_b32 v63, s53, 13
-; GFX9-NEXT:    v_writelane_b32 v63, s54, 14
+; GFX9-NEXT:    v_writelane_b32 v63, s34, 0
+; GFX9-NEXT:    v_writelane_b32 v63, s35, 1
+; GFX9-NEXT:    v_writelane_b32 v63, s36, 2
+; GFX9-NEXT:    v_writelane_b32 v63, s37, 3
+; GFX9-NEXT:    v_writelane_b32 v63, s38, 4
+; GFX9-NEXT:    v_writelane_b32 v63, s39, 5
+; GFX9-NEXT:    v_writelane_b32 v63, s48, 6
+; GFX9-NEXT:    v_writelane_b32 v63, s49, 7
+; GFX9-NEXT:    v_writelane_b32 v63, s50, 8
+; GFX9-NEXT:    v_writelane_b32 v63, s51, 9
+; GFX9-NEXT:    v_writelane_b32 v63, s52, 10
+; GFX9-NEXT:    v_writelane_b32 v63, s53, 11
+; GFX9-NEXT:    v_writelane_b32 v63, s54, 12
+; GFX9-NEXT:    v_writelane_b32 v63, s55, 13
+; GFX9-NEXT:    v_writelane_b32 v63, s30, 14
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v3
-; GFX9-NEXT:    v_writelane_b32 v63, s55, 15
+; GFX9-NEXT:    v_writelane_b32 v63, s31, 15
 ; GFX9-NEXT:    v_readfirstlane_b32 s5, v2
 ; GFX9-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
@@ -89382,22 +89382,22 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_perm_b32 v2, v53, v2, s4
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_readlane_b32 s55, v63, 15
-; GFX9-NEXT:    v_readlane_b32 s54, v63, 14
-; GFX9-NEXT:    v_readlane_b32 s53, v63, 13
-; GFX9-NEXT:    v_readlane_b32 s52, v63, 12
-; GFX9-NEXT:    v_readlane_b32 s51, v63, 11
-; GFX9-NEXT:    v_readlane_b32 s50, v63, 10
-; GFX9-NEXT:    v_readlane_b32 s49, v63, 9
-; GFX9-NEXT:    v_readlane_b32 s48, v63, 8
-; GFX9-NEXT:    v_readlane_b32 s39, v63, 7
-; GFX9-NEXT:    v_readlane_b32 s38, v63, 6
-; GFX9-NEXT:    v_readlane_b32 s37, v63, 5
-; GFX9-NEXT:    v_readlane_b32 s36, v63, 4
-; GFX9-NEXT:    v_readlane_b32 s35, v63, 3
-; GFX9-NEXT:    v_readlane_b32 s34, v63, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v63, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v63, 0
+; GFX9-NEXT:    v_readlane_b32 s30, v63, 14
+; GFX9-NEXT:    v_readlane_b32 s31, v63, 15
+; GFX9-NEXT:    v_readlane_b32 s55, v63, 13
+; GFX9-NEXT:    v_readlane_b32 s54, v63, 12
+; GFX9-NEXT:    v_readlane_b32 s53, v63, 11
+; GFX9-NEXT:    v_readlane_b32 s52, v63, 10
+; GFX9-NEXT:    v_readlane_b32 s51, v63, 9
+; GFX9-NEXT:    v_readlane_b32 s50, v63, 8
+; GFX9-NEXT:    v_readlane_b32 s49, v63, 7
+; GFX9-NEXT:    v_readlane_b32 s48, v63, 6
+; GFX9-NEXT:    v_readlane_b32 s39, v63, 5
+; GFX9-NEXT:    v_readlane_b32 s38, v63, 4
+; GFX9-NEXT:    v_readlane_b32 s37, v63, 3
+; GFX9-NEXT:    v_readlane_b32 s36, v63, 2
+; GFX9-NEXT:    v_readlane_b32 s35, v63, 1
+; GFX9-NEXT:    v_readlane_b32 s34, v63, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_perm_b32 v3, v3, v4, s4
 ; GFX9-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
@@ -89498,18 +89498,18 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
 ; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s4, -1
 ; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
 ; GFX11-TRUE16-NEXT:    s_mov_b32 exec_lo, s4
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s34, 0
 ; GFX11-TRUE16-NEXT:    s_cmp_lg_u32 s28, 0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s42, 0
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s34, 2
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s35, 3
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s36, 4
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s37, 5
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s38, 6
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s39, 7
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s48, 8
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s49, 9
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s35, 1
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s36, 2
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s37, 3
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s38, 4
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s39, 5
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s48, 6
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s49, 7
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s30, 8
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s31, 9
 ; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB109_3
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s93, s27, 24
@@ -90072,21 +90072,21 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v20, v10
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v21, v7
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v15, v22
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v40, 8
 ; GFX11-TRUE16-NEXT:    s_clause 0x3
 ; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[1:4], off
 ; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[11:14], off offset:16
 ; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[16:19], off offset:32
 ; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[5:8], off offset:48
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s49, v40, 9
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s48, v40, 8
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s39, v40, 7
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s38, v40, 6
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s37, v40, 5
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s36, v40, 4
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s35, v40, 3
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s34, v40, 2
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 9
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s49, v40, 7
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s48, v40, 6
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s39, v40, 5
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s38, v40, 4
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s37, v40, 3
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s36, v40, 2
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s35, v40, 1
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s34, v40, 0
 ; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX11-TRUE16-NEXT:    scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
 ; GFX11-TRUE16-NEXT:    s_mov_b32 exec_lo, s0
@@ -90099,18 +90099,18 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
 ; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s4, -1
 ; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
 ; GFX11-FAKE16-NEXT:    s_mov_b32 exec_lo, s4
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s34, 0
 ; GFX11-FAKE16-NEXT:    s_cmp_lg_u32 s28, 0
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s42, 0
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s34, 2
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s35, 3
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s36, 4
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s37, 5
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s38, 6
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s39, 7
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s48, 8
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s49, 9
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s35, 1
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s36, 2
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s37, 3
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s38, 4
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s39, 5
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s48, 6
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s49, 7
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s30, 8
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s31, 9
 ; GFX11-FAKE16-NEXT:    s_cbranch_scc0 .LBB109_3
 ; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s93, s27, 24
@@ -90676,21 +90676,21 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v19, v21
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v22, v23
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v13, v20
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v40, 8
 ; GFX11-FAKE16-NEXT:    s_clause 0x3
 ; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[1:4], off
 ; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[9:12], off offset:16
 ; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[14:17], off offset:32
 ; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[5:8], off offset:48
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s49, v40, 9
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s48, v40, 8
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s39, v40, 7
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s38, v40, 6
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s37, v40, 5
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s36, v40, 4
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s35, v40, 3
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s34, v40, 2
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 9
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s49, v40, 7
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s48, v40, 6
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s39, v40, 5
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s38, v40, 4
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s37, v40, 3
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s36, v40, 2
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s35, v40, 1
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s34, v40, 0
 ; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX11-FAKE16-NEXT:    scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
 ; GFX11-FAKE16-NEXT:    s_mov_b32 exec_lo, s0
@@ -90713,7 +90713,7 @@ end:
   ret <64 x i8> %phi
 }
 
-define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
+define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v64i8_to_v32bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -93204,7 +93204,7 @@ end:
   ret <32 x bfloat> %phi
 }
 
-define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, i32 inreg %b) {
+define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v64i8_to_v32bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -93213,15 +93213,13 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
 ; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_writelane_b32 v40, s30, 0
-; SI-NEXT:    v_writelane_b32 v40, s31, 1
-; SI-NEXT:    v_writelane_b32 v40, s34, 2
-; SI-NEXT:    v_writelane_b32 v40, s35, 3
-; SI-NEXT:    v_writelane_b32 v40, s36, 4
+; SI-NEXT:    v_writelane_b32 v40, s34, 0
+; SI-NEXT:    v_writelane_b32 v40, s35, 1
+; SI-NEXT:    v_writelane_b32 v40, s36, 2
 ; SI-NEXT:    s_mov_b32 s6, s19
 ; SI-NEXT:    v_readfirstlane_b32 s19, v28
 ; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:76
-; SI-NEXT:    v_writelane_b32 v40, s37, 5
+; SI-NEXT:    v_writelane_b32 v40, s37, 3
 ; SI-NEXT:    v_readfirstlane_b32 s8, v30
 ; SI-NEXT:    v_readfirstlane_b32 s37, v29
 ; SI-NEXT:    v_readfirstlane_b32 s47, v27
@@ -93247,51 +93245,52 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
 ; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:8
 ; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:4
 ; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32
-; SI-NEXT:    v_writelane_b32 v40, s38, 6
-; SI-NEXT:    v_writelane_b32 v40, s39, 7
-; SI-NEXT:    v_writelane_b32 v40, s48, 8
-; SI-NEXT:    v_writelane_b32 v40, s49, 9
-; SI-NEXT:    v_writelane_b32 v40, s50, 10
-; SI-NEXT:    v_writelane_b32 v40, s51, 11
-; SI-NEXT:    v_writelane_b32 v40, s52, 12
-; SI-NEXT:    v_writelane_b32 v40, s53, 13
-; SI-NEXT:    v_writelane_b32 v40, s54, 14
-; SI-NEXT:    v_writelane_b32 v40, s55, 15
-; SI-NEXT:    v_writelane_b32 v40, s64, 16
-; SI-NEXT:    v_writelane_b32 v40, s65, 17
-; SI-NEXT:    v_writelane_b32 v40, s66, 18
-; SI-NEXT:    v_writelane_b32 v40, s67, 19
-; SI-NEXT:    v_writelane_b32 v40, s68, 20
-; SI-NEXT:    v_writelane_b32 v40, s69, 21
+; SI-NEXT:    v_writelane_b32 v40, s38, 4
+; SI-NEXT:    v_writelane_b32 v40, s39, 5
+; SI-NEXT:    v_writelane_b32 v40, s48, 6
+; SI-NEXT:    v_writelane_b32 v40, s49, 7
+; SI-NEXT:    v_writelane_b32 v40, s50, 8
+; SI-NEXT:    v_writelane_b32 v40, s51, 9
+; SI-NEXT:    v_writelane_b32 v40, s52, 10
+; SI-NEXT:    v_writelane_b32 v40, s53, 11
+; SI-NEXT:    v_writelane_b32 v40, s54, 12
+; SI-NEXT:    v_writelane_b32 v40, s55, 13
+; SI-NEXT:    v_writelane_b32 v40, s64, 14
+; SI-NEXT:    v_writelane_b32 v40, s65, 15
+; SI-NEXT:    v_writelane_b32 v40, s66, 16
+; SI-NEXT:    v_writelane_b32 v40, s67, 17
+; SI-NEXT:    v_writelane_b32 v40, s68, 18
+; SI-NEXT:    v_writelane_b32 v40, s69, 19
+; SI-NEXT:    v_writelane_b32 v40, s70, 20
 ; SI-NEXT:    ; implicit-def: $vgpr41 : SGPR spill to VGPR lane
-; SI-NEXT:    v_writelane_b32 v40, s70, 22
+; SI-NEXT:    v_writelane_b32 v40, s71, 21
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_writelane_b32 v41, s18, 0
-; SI-NEXT:    v_writelane_b32 v40, s71, 23
+; SI-NEXT:    v_writelane_b32 v40, s80, 22
 ; SI-NEXT:    v_writelane_b32 v41, s6, 1
-; SI-NEXT:    v_writelane_b32 v40, s80, 24
+; SI-NEXT:    v_writelane_b32 v40, s81, 23
 ; SI-NEXT:    v_writelane_b32 v41, s17, 2
-; SI-NEXT:    v_writelane_b32 v40, s81, 25
+; SI-NEXT:    v_writelane_b32 v40, s82, 24
 ; SI-NEXT:    v_writelane_b32 v41, s16, 3
-; SI-NEXT:    v_writelane_b32 v40, s82, 26
+; SI-NEXT:    v_writelane_b32 v40, s83, 25
 ; SI-NEXT:    v_writelane_b32 v41, s22, 4
-; SI-NEXT:    v_writelane_b32 v40, s83, 27
+; SI-NEXT:    v_writelane_b32 v40, s84, 26
 ; SI-NEXT:    v_writelane_b32 v41, s23, 5
-; SI-NEXT:    v_writelane_b32 v40, s84, 28
+; SI-NEXT:    v_writelane_b32 v40, s85, 27
 ; SI-NEXT:    v_writelane_b32 v41, s21, 6
-; SI-NEXT:    v_writelane_b32 v40, s85, 29
+; SI-NEXT:    v_writelane_b32 v40, s86, 28
 ; SI-NEXT:    v_writelane_b32 v41, s20, 7
-; SI-NEXT:    v_writelane_b32 v40, s86, 30
+; SI-NEXT:    v_writelane_b32 v40, s87, 29
 ; SI-NEXT:    v_writelane_b32 v41, s26, 8
-; SI-NEXT:    v_writelane_b32 v40, s87, 31
+; SI-NEXT:    v_writelane_b32 v40, s96, 30
 ; SI-NEXT:    v_writelane_b32 v41, s27, 9
-; SI-NEXT:    v_writelane_b32 v40, s96, 32
+; SI-NEXT:    v_writelane_b32 v40, s97, 31
 ; SI-NEXT:    v_writelane_b32 v41, s25, 10
-; SI-NEXT:    v_writelane_b32 v40, s97, 33
+; SI-NEXT:    v_writelane_b32 v40, s98, 32
 ; SI-NEXT:    v_writelane_b32 v41, s24, 11
-; SI-NEXT:    v_writelane_b32 v40, s98, 34
+; SI-NEXT:    v_writelane_b32 v40, s99, 33
 ; SI-NEXT:    v_writelane_b32 v41, s29, 12
-; SI-NEXT:    v_writelane_b32 v40, s99, 35
+; SI-NEXT:    v_writelane_b32 v40, s30, 34
 ; SI-NEXT:    v_readfirstlane_b32 s75, v23
 ; SI-NEXT:    v_readfirstlane_b32 s76, v22
 ; SI-NEXT:    v_readfirstlane_b32 s62, v21
@@ -93320,9 +93319,9 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
 ; SI-NEXT:    v_readfirstlane_b32 s87, v24
 ; SI-NEXT:    v_readfirstlane_b32 s86, v25
 ; SI-NEXT:    v_readfirstlane_b32 s98, v26
-; SI-NEXT:    v_readfirstlane_b32 s99, v27
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v28
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
+; SI-NEXT:    v_readfirstlane_b32 s99, v27
 ; SI-NEXT:    v_readfirstlane_b32 s82, v29
 ; SI-NEXT:    s_waitcnt vmcnt(13)
 ; SI-NEXT:    v_readfirstlane_b32 s81, v30
@@ -93353,6 +93352,7 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_readfirstlane_b32 s51, v51
 ; SI-NEXT:    v_writelane_b32 v41, s28, 13
+; SI-NEXT:    v_writelane_b32 v40, s31, 35
 ; SI-NEXT:    v_writelane_b32 v41, s7, 14
 ; SI-NEXT:    s_cbranch_scc0 .LBB111_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
@@ -93806,43 +93806,43 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
 ; SI-NEXT:    v_mul_f32_e64 v15, 1.0, s69
 ; SI-NEXT:    v_lshrrev_b32_e32 v16, 16, v15
 ; SI-NEXT:    v_mul_f32_e64 v15, 1.0, s71
+; SI-NEXT:    v_readlane_b32 s30, v40, 34
 ; SI-NEXT:    v_lshr_b64 v[15:16], v[15:16], 16
-; SI-NEXT:    v_readlane_b32 s99, v40, 35
-; SI-NEXT:    v_readlane_b32 s98, v40, 34
-; SI-NEXT:    v_readlane_b32 s97, v40, 33
-; SI-NEXT:    v_readlane_b32 s96, v40, 32
-; SI-NEXT:    v_readlane_b32 s87, v40, 31
-; SI-NEXT:    v_readlane_b32 s86, v40, 30
-; SI-NEXT:    v_readlane_b32 s85, v40, 29
-; SI-NEXT:    v_readlane_b32 s84, v40, 28
-; SI-NEXT:    v_readlane_b32 s83, v40, 27
-; SI-NEXT:    v_readlane_b32 s82, v40, 26
-; SI-NEXT:    v_readlane_b32 s81, v40, 25
-; SI-NEXT:    v_readlane_b32 s80, v40, 24
-; SI-NEXT:    v_readlane_b32 s71, v40, 23
-; SI-NEXT:    v_readlane_b32 s70, v40, 22
-; SI-NEXT:    v_readlane_b32 s69, v40, 21
-; SI-NEXT:    v_readlane_b32 s68, v40, 20
-; SI-NEXT:    v_readlane_b32 s67, v40, 19
-; SI-NEXT:    v_readlane_b32 s66, v40, 18
-; SI-NEXT:    v_readlane_b32 s65, v40, 17
-; SI-NEXT:    v_readlane_b32 s64, v40, 16
-; SI-NEXT:    v_readlane_b32 s55, v40, 15
-; SI-NEXT:    v_readlane_b32 s54, v40, 14
-; SI-NEXT:    v_readlane_b32 s53, v40, 13
-; SI-NEXT:    v_readlane_b32 s52, v40, 12
-; SI-NEXT:    v_readlane_b32 s51, v40, 11
-; SI-NEXT:    v_readlane_b32 s50, v40, 10
-; SI-NEXT:    v_readlane_b32 s49, v40, 9
-; SI-NEXT:    v_readlane_b32 s48, v40, 8
-; SI-NEXT:    v_readlane_b32 s39, v40, 7
-; SI-NEXT:    v_readlane_b32 s38, v40, 6
-; SI-NEXT:    v_readlane_b32 s37, v40, 5
-; SI-NEXT:    v_readlane_b32 s36, v40, 4
-; SI-NEXT:    v_readlane_b32 s35, v40, 3
-; SI-NEXT:    v_readlane_b32 s34, v40, 2
-; SI-NEXT:    v_readlane_b32 s31, v40, 1
-; SI-NEXT:    v_readlane_b32 s30, v40, 0
+; SI-NEXT:    v_readlane_b32 s31, v40, 35
+; SI-NEXT:    v_readlane_b32 s99, v40, 33
+; SI-NEXT:    v_readlane_b32 s98, v40, 32
+; SI-NEXT:    v_readlane_b32 s97, v40, 31
+; SI-NEXT:    v_readlane_b32 s96, v40, 30
+; SI-NEXT:    v_readlane_b32 s87, v40, 29
+; SI-NEXT:    v_readlane_b32 s86, v40, 28
+; SI-NEXT:    v_readlane_b32 s85, v40, 27
+; SI-NEXT:    v_readlane_b32 s84, v40, 26
+; SI-NEXT:    v_readlane_b32 s83, v40, 25
+; SI-NEXT:    v_readlane_b32 s82, v40, 24
+; SI-NEXT:    v_readlane_b32 s81, v40, 23
+; SI-NEXT:    v_readlane_b32 s80, v40, 22
+; SI-NEXT:    v_readlane_b32 s71, v40, 21
+; SI-NEXT:    v_readlane_b32 s70, v40, 20
+; SI-NEXT:    v_readlane_b32 s69, v40, 19
+; SI-NEXT:    v_readlane_b32 s68, v40, 18
+; SI-NEXT:    v_readlane_b32 s67, v40, 17
+; SI-NEXT:    v_readlane_b32 s66, v40, 16
+; SI-NEXT:    v_readlane_b32 s65, v40, 15
+; SI-NEXT:    v_readlane_b32 s64, v40, 14
+; SI-NEXT:    v_readlane_b32 s55, v40, 13
+; SI-NEXT:    v_readlane_b32 s54, v40, 12
+; SI-NEXT:    v_readlane_b32 s53, v40, 11
+; SI-NEXT:    v_readlane_b32 s52, v40, 10
+; SI-NEXT:    v_readlane_b32 s51, v40, 9
+; SI-NEXT:    v_readlane_b32 s50, v40, 8
+; SI-NEXT:    v_readlane_b32 s49, v40, 7
+; SI-NEXT:    v_readlane_b32 s48, v40, 6
+; SI-NEXT:    v_readlane_b32 s39, v40, 5
+; SI-NEXT:    v_readlane_b32 s38, v40, 4
+; SI-NEXT:    v_readlane_b32 s37, v40, 3
+; SI-NEXT:    v_readlane_b32 s36, v40, 2
+; SI-NEXT:    v_readlane_b32 s35, v40, 1
+; SI-NEXT:    v_readlane_b32 s34, v40, 0
 ; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
@@ -95095,3 +95095,5 @@ end:
   %phi = phi <32 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
   ret <32 x bfloat> %phi
 }
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll
index 39a3c96b9839a..e04fb2918a8ca 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll
@@ -6,7 +6,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
-define <18 x float> @bitcast_v18i32_to_v18f32(<18 x i32> %a, i32 %b) {
+define <18 x float> @bitcast_v18i32_to_v18f32(<18 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v18i32_to_v18f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -148,7 +148,7 @@ end:
   ret <18 x float> %phi
 }
 
-define inreg <18 x float> @bitcast_v18i32_to_v18f32_scalar(<18 x i32> inreg %a, i32 inreg %b) {
+define inreg <18 x float> @bitcast_v18i32_to_v18f32_scalar(<18 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v18i32_to_v18f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -371,7 +371,7 @@ end:
   ret <18 x float> %phi
 }
 
-define <18 x i32> @bitcast_v18f32_to_v18i32(<18 x float> %a, i32 %b) {
+define <18 x i32> @bitcast_v18f32_to_v18i32(<18 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v18f32_to_v18i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -504,7 +504,7 @@ end:
   ret <18 x i32> %phi
 }
 
-define inreg <18 x i32> @bitcast_v18f32_to_v18i32_scalar(<18 x float> inreg %a, i32 inreg %b) {
+define inreg <18 x i32> @bitcast_v18f32_to_v18i32_scalar(<18 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v18f32_to_v18i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -952,7 +952,7 @@ end:
   ret <18 x i32> %phi
 }
 
-define <9 x i64> @bitcast_v18i32_to_v9i64(<18 x i32> %a, i32 %b) {
+define <9 x i64> @bitcast_v18i32_to_v9i64(<18 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v18i32_to_v9i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1094,7 +1094,7 @@ end:
   ret <9 x i64> %phi
 }
 
-define inreg <9 x i64> @bitcast_v18i32_to_v9i64_scalar(<18 x i32> inreg %a, i32 inreg %b) {
+define inreg <9 x i64> @bitcast_v18i32_to_v9i64_scalar(<18 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v18i32_to_v9i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1317,7 +1317,7 @@ end:
   ret <9 x i64> %phi
 }
 
-define <18 x i32> @bitcast_v9i64_to_v18i32(<9 x i64> %a, i32 %b) {
+define <18 x i32> @bitcast_v9i64_to_v18i32(<9 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v9i64_to_v18i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1464,7 +1464,7 @@ end:
   ret <18 x i32> %phi
 }
 
-define inreg <18 x i32> @bitcast_v9i64_to_v18i32_scalar(<9 x i64> inreg %a, i32 inreg %b) {
+define inreg <18 x i32> @bitcast_v9i64_to_v18i32_scalar(<9 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v9i64_to_v18i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1687,7 +1687,7 @@ end:
   ret <18 x i32> %phi
 }
 
-define <9 x double> @bitcast_v18i32_to_v9f64(<18 x i32> %a, i32 %b) {
+define <9 x double> @bitcast_v18i32_to_v9f64(<18 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v18i32_to_v9f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1829,7 +1829,7 @@ end:
   ret <9 x double> %phi
 }
 
-define inreg <9 x double> @bitcast_v18i32_to_v9f64_scalar(<18 x i32> inreg %a, i32 inreg %b) {
+define inreg <9 x double> @bitcast_v18i32_to_v9f64_scalar(<18 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v18i32_to_v9f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2052,7 +2052,7 @@ end:
   ret <9 x double> %phi
 }
 
-define <18 x i32> @bitcast_v9f64_to_v18i32(<9 x double> %a, i32 %b) {
+define <18 x i32> @bitcast_v9f64_to_v18i32(<9 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v9f64_to_v18i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2158,7 +2158,7 @@ end:
   ret <18 x i32> %phi
 }
 
-define inreg <18 x i32> @bitcast_v9f64_to_v18i32_scalar(<9 x double> inreg %a, i32 inreg %b) {
+define inreg <18 x i32> @bitcast_v9f64_to_v18i32_scalar(<9 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v9f64_to_v18i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2570,7 +2570,7 @@ end:
   ret <18 x i32> %phi
 }
 
-define <36 x i16> @bitcast_v18i32_to_v36i16(<18 x i32> %a, i32 %b) {
+define <36 x i16> @bitcast_v18i32_to_v36i16(<18 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v18i32_to_v36i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3100,7 +3100,7 @@ end:
   ret <36 x i16> %phi
 }
 
-define inreg <36 x i16> @bitcast_v18i32_to_v36i16_scalar(<18 x i32> inreg %a, i32 inreg %b) {
+define inreg <36 x i16> @bitcast_v18i32_to_v36i16_scalar(<18 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v18i32_to_v36i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3683,7 +3683,7 @@ end:
   ret <36 x i16> %phi
 }
 
-define <18 x i32> @bitcast_v36i16_to_v18i32(<36 x i16> %a, i32 %b) {
+define <18 x i32> @bitcast_v36i16_to_v18i32(<36 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v36i16_to_v18i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4424,7 +4424,7 @@ end:
   ret <18 x i32> %phi
 }
 
-define inreg <18 x i32> @bitcast_v36i16_to_v18i32_scalar(<36 x i16> inreg %a, i32 inreg %b) {
+define inreg <18 x i32> @bitcast_v36i16_to_v18i32_scalar(<36 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v36i16_to_v18i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5138,7 +5138,7 @@ end:
   ret <18 x i32> %phi
 }
 
-define <36 x half> @bitcast_v18i32_to_v36f16(<18 x i32> %a, i32 %b) {
+define <36 x half> @bitcast_v18i32_to_v36f16(<18 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v18i32_to_v36f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5668,7 +5668,7 @@ end:
   ret <36 x half> %phi
 }
 
-define inreg <36 x half> @bitcast_v18i32_to_v36f16_scalar(<18 x i32> inreg %a, i32 inreg %b) {
+define inreg <36 x half> @bitcast_v18i32_to_v36f16_scalar(<18 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v18i32_to_v36f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6251,7 +6251,7 @@ end:
   ret <36 x half> %phi
 }
 
-define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) {
+define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v36f16_to_v18i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7066,7 +7066,7 @@ end:
   ret <18 x i32> %phi
 }
 
-define inreg <18 x i32> @bitcast_v36f16_to_v18i32_scalar(<36 x half> inreg %a, i32 inreg %b) {
+define inreg <18 x i32> @bitcast_v36f16_to_v18i32_scalar(<36 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v36f16_to_v18i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7850,7 +7850,7 @@ end:
   ret <18 x i32> %phi
 }
 
-define <9 x i64> @bitcast_v18f32_to_v9i64(<18 x float> %a, i32 %b) {
+define <9 x i64> @bitcast_v18f32_to_v9i64(<18 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v18f32_to_v9i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7983,7 +7983,7 @@ end:
   ret <9 x i64> %phi
 }
 
-define inreg <9 x i64> @bitcast_v18f32_to_v9i64_scalar(<18 x float> inreg %a, i32 inreg %b) {
+define inreg <9 x i64> @bitcast_v18f32_to_v9i64_scalar(<18 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v18f32_to_v9i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8431,7 +8431,7 @@ end:
   ret <9 x i64> %phi
 }
 
-define <18 x float> @bitcast_v9i64_to_v18f32(<9 x i64> %a, i32 %b) {
+define <18 x float> @bitcast_v9i64_to_v18f32(<9 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v9i64_to_v18f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8578,7 +8578,7 @@ end:
   ret <18 x float> %phi
 }
 
-define inreg <18 x float> @bitcast_v9i64_to_v18f32_scalar(<9 x i64> inreg %a, i32 inreg %b) {
+define inreg <18 x float> @bitcast_v9i64_to_v18f32_scalar(<9 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v9i64_to_v18f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8801,7 +8801,7 @@ end:
   ret <18 x float> %phi
 }
 
-define <9 x double> @bitcast_v18f32_to_v9f64(<18 x float> %a, i32 %b) {
+define <9 x double> @bitcast_v18f32_to_v9f64(<18 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v18f32_to_v9f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8934,7 +8934,7 @@ end:
   ret <9 x double> %phi
 }
 
-define inreg <9 x double> @bitcast_v18f32_to_v9f64_scalar(<18 x float> inreg %a, i32 inreg %b) {
+define inreg <9 x double> @bitcast_v18f32_to_v9f64_scalar(<18 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v18f32_to_v9f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9382,7 +9382,7 @@ end:
   ret <9 x double> %phi
 }
 
-define <18 x float> @bitcast_v9f64_to_v18f32(<9 x double> %a, i32 %b) {
+define <18 x float> @bitcast_v9f64_to_v18f32(<9 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v9f64_to_v18f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9488,7 +9488,7 @@ end:
   ret <18 x float> %phi
 }
 
-define inreg <18 x float> @bitcast_v9f64_to_v18f32_scalar(<9 x double> inreg %a, i32 inreg %b) {
+define inreg <18 x float> @bitcast_v9f64_to_v18f32_scalar(<9 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v9f64_to_v18f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9900,7 +9900,7 @@ end:
   ret <18 x float> %phi
 }
 
-define <36 x i16> @bitcast_v18f32_to_v36i16(<18 x float> %a, i32 %b) {
+define <36 x i16> @bitcast_v18f32_to_v36i16(<18 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v18f32_to_v36i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10412,7 +10412,7 @@ end:
   ret <36 x i16> %phi
 }
 
-define inreg <36 x i16> @bitcast_v18f32_to_v36i16_scalar(<18 x float> inreg %a, i32 inreg %b) {
+define inreg <36 x i16> @bitcast_v18f32_to_v36i16_scalar(<18 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v18f32_to_v36i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11210,7 +11210,7 @@ end:
   ret <36 x i16> %phi
 }
 
-define <18 x float> @bitcast_v36i16_to_v18f32(<36 x i16> %a, i32 %b) {
+define <18 x float> @bitcast_v36i16_to_v18f32(<36 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v36i16_to_v18f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11951,7 +11951,7 @@ end:
   ret <18 x float> %phi
 }
 
-define inreg <18 x float> @bitcast_v36i16_to_v18f32_scalar(<36 x i16> inreg %a, i32 inreg %b) {
+define inreg <18 x float> @bitcast_v36i16_to_v18f32_scalar(<36 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v36i16_to_v18f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12665,7 +12665,7 @@ end:
   ret <18 x float> %phi
 }
 
-define <36 x half> @bitcast_v18f32_to_v36f16(<18 x float> %a, i32 %b) {
+define <36 x half> @bitcast_v18f32_to_v36f16(<18 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v18f32_to_v36f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13177,7 +13177,7 @@ end:
   ret <36 x half> %phi
 }
 
-define inreg <36 x half> @bitcast_v18f32_to_v36f16_scalar(<18 x float> inreg %a, i32 inreg %b) {
+define inreg <36 x half> @bitcast_v18f32_to_v36f16_scalar(<18 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v18f32_to_v36f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13975,7 +13975,7 @@ end:
   ret <36 x half> %phi
 }
 
-define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) {
+define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v36f16_to_v18f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14790,7 +14790,7 @@ end:
   ret <18 x float> %phi
 }
 
-define inreg <18 x float> @bitcast_v36f16_to_v18f32_scalar(<36 x half> inreg %a, i32 inreg %b) {
+define inreg <18 x float> @bitcast_v36f16_to_v18f32_scalar(<36 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v36f16_to_v18f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15574,7 +15574,7 @@ end:
   ret <18 x float> %phi
 }
 
-define <9 x double> @bitcast_v9i64_to_v9f64(<9 x i64> %a, i32 %b) {
+define <9 x double> @bitcast_v9i64_to_v9f64(<9 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v9i64_to_v9f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15721,7 +15721,7 @@ end:
   ret <9 x double> %phi
 }
 
-define inreg <9 x double> @bitcast_v9i64_to_v9f64_scalar(<9 x i64> inreg %a, i32 inreg %b) {
+define inreg <9 x double> @bitcast_v9i64_to_v9f64_scalar(<9 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v9i64_to_v9f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15943,7 +15943,7 @@ end:
   ret <9 x double> %phi
 }
 
-define <9 x i64> @bitcast_v9f64_to_v9i64(<9 x double> %a, i32 %b) {
+define <9 x i64> @bitcast_v9f64_to_v9i64(<9 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v9f64_to_v9i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16049,7 +16049,7 @@ end:
   ret <9 x i64> %phi
 }
 
-define inreg <9 x i64> @bitcast_v9f64_to_v9i64_scalar(<9 x double> inreg %a, i32 inreg %b) {
+define inreg <9 x i64> @bitcast_v9f64_to_v9i64_scalar(<9 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v9f64_to_v9i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16461,7 +16461,7 @@ end:
   ret <9 x i64> %phi
 }
 
-define <36 x i16> @bitcast_v9i64_to_v36i16(<9 x i64> %a, i32 %b) {
+define <36 x i16> @bitcast_v9i64_to_v36i16(<9 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v9i64_to_v36i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17001,7 +17001,7 @@ end:
   ret <36 x i16> %phi
 }
 
-define inreg <36 x i16> @bitcast_v9i64_to_v36i16_scalar(<9 x i64> inreg %a, i32 inreg %b) {
+define inreg <36 x i16> @bitcast_v9i64_to_v36i16_scalar(<9 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v9i64_to_v36i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17584,7 +17584,7 @@ end:
   ret <36 x i16> %phi
 }
 
-define <9 x i64> @bitcast_v36i16_to_v9i64(<36 x i16> %a, i32 %b) {
+define <9 x i64> @bitcast_v36i16_to_v9i64(<36 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v36i16_to_v9i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18325,7 +18325,7 @@ end:
   ret <9 x i64> %phi
 }
 
-define inreg <9 x i64> @bitcast_v36i16_to_v9i64_scalar(<36 x i16> inreg %a, i32 inreg %b) {
+define inreg <9 x i64> @bitcast_v36i16_to_v9i64_scalar(<36 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v36i16_to_v9i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19039,7 +19039,7 @@ end:
   ret <9 x i64> %phi
 }
 
-define <36 x half> @bitcast_v9i64_to_v36f16(<9 x i64> %a, i32 %b) {
+define <36 x half> @bitcast_v9i64_to_v36f16(<9 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v9i64_to_v36f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19579,7 +19579,7 @@ end:
   ret <36 x half> %phi
 }
 
-define inreg <36 x half> @bitcast_v9i64_to_v36f16_scalar(<9 x i64> inreg %a, i32 inreg %b) {
+define inreg <36 x half> @bitcast_v9i64_to_v36f16_scalar(<9 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v9i64_to_v36f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20162,7 +20162,7 @@ end:
   ret <36 x half> %phi
 }
 
-define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) {
+define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v36f16_to_v9i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20977,7 +20977,7 @@ end:
   ret <9 x i64> %phi
 }
 
-define inreg <9 x i64> @bitcast_v36f16_to_v9i64_scalar(<36 x half> inreg %a, i32 inreg %b) {
+define inreg <9 x i64> @bitcast_v36f16_to_v9i64_scalar(<36 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v36f16_to_v9i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -21761,7 +21761,7 @@ end:
   ret <9 x i64> %phi
 }
 
-define <36 x i16> @bitcast_v9f64_to_v36i16(<9 x double> %a, i32 %b) {
+define <36 x i16> @bitcast_v9f64_to_v36i16(<9 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v9f64_to_v36i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22246,7 +22246,7 @@ end:
   ret <36 x i16> %phi
 }
 
-define inreg <36 x i16> @bitcast_v9f64_to_v36i16_scalar(<9 x double> inreg %a, i32 inreg %b) {
+define inreg <36 x i16> @bitcast_v9f64_to_v36i16_scalar(<9 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v9f64_to_v36i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22999,7 +22999,7 @@ end:
   ret <36 x i16> %phi
 }
 
-define <9 x double> @bitcast_v36i16_to_v9f64(<36 x i16> %a, i32 %b) {
+define <9 x double> @bitcast_v36i16_to_v9f64(<36 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v36i16_to_v9f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -23740,7 +23740,7 @@ end:
   ret <9 x double> %phi
 }
 
-define inreg <9 x double> @bitcast_v36i16_to_v9f64_scalar(<36 x i16> inreg %a, i32 inreg %b) {
+define inreg <9 x double> @bitcast_v36i16_to_v9f64_scalar(<36 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v36i16_to_v9f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -24454,7 +24454,7 @@ end:
   ret <9 x double> %phi
 }
 
-define <36 x half> @bitcast_v9f64_to_v36f16(<9 x double> %a, i32 %b) {
+define <36 x half> @bitcast_v9f64_to_v36f16(<9 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v9f64_to_v36f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -24939,7 +24939,7 @@ end:
   ret <36 x half> %phi
 }
 
-define inreg <36 x half> @bitcast_v9f64_to_v36f16_scalar(<9 x double> inreg %a, i32 inreg %b) {
+define inreg <36 x half> @bitcast_v9f64_to_v36f16_scalar(<9 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v9f64_to_v36f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -25692,7 +25692,7 @@ end:
   ret <36 x half> %phi
 }
 
-define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) {
+define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v36f16_to_v9f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -26507,7 +26507,7 @@ end:
   ret <9 x double> %phi
 }
 
-define inreg <9 x double> @bitcast_v36f16_to_v9f64_scalar(<36 x half> inreg %a, i32 inreg %b) {
+define inreg <9 x double> @bitcast_v36f16_to_v9f64_scalar(<36 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v36f16_to_v9f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -27291,7 +27291,7 @@ end:
   ret <9 x double> %phi
 }
 
-define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) {
+define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v36i16_to_v36f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -28045,7 +28045,7 @@ end:
   ret <36 x half> %phi
 }
 
-define inreg <36 x half> @bitcast_v36i16_to_v36f16_scalar(<36 x i16> inreg %a, i32 inreg %b) {
+define inreg <36 x half> @bitcast_v36i16_to_v36f16_scalar(<36 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v36i16_to_v36f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -28053,26 +28053,26 @@ define inreg <36 x half> @bitcast_v36i16_to_v36f16_scalar(<36 x i16> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v18, s30, 0
-; SI-NEXT:    v_writelane_b32 v18, s31, 1
-; SI-NEXT:    v_writelane_b32 v18, s34, 2
-; SI-NEXT:    v_writelane_b32 v18, s35, 3
-; SI-NEXT:    v_writelane_b32 v18, s36, 4
-; SI-NEXT:    v_writelane_b32 v18, s37, 5
-; SI-NEXT:    v_writelane_b32 v18, s38, 6
-; SI-NEXT:    v_writelane_b32 v18, s39, 7
-; SI-NEXT:    v_writelane_b32 v18, s48, 8
-; SI-NEXT:    v_writelane_b32 v18, s49, 9
-; SI-NEXT:    v_writelane_b32 v18, s50, 10
-; SI-NEXT:    v_writelane_b32 v18, s51, 11
-; SI-NEXT:    v_writelane_b32 v18, s52, 12
-; SI-NEXT:    v_writelane_b32 v18, s53, 13
-; SI-NEXT:    v_writelane_b32 v18, s54, 14
-; SI-NEXT:    v_writelane_b32 v18, s55, 15
+; SI-NEXT:    v_writelane_b32 v18, s34, 0
+; SI-NEXT:    v_writelane_b32 v18, s35, 1
+; SI-NEXT:    v_writelane_b32 v18, s36, 2
+; SI-NEXT:    v_writelane_b32 v18, s37, 3
+; SI-NEXT:    v_writelane_b32 v18, s38, 4
+; SI-NEXT:    v_writelane_b32 v18, s39, 5
+; SI-NEXT:    v_writelane_b32 v18, s48, 6
+; SI-NEXT:    v_writelane_b32 v18, s49, 7
+; SI-NEXT:    v_writelane_b32 v18, s50, 8
+; SI-NEXT:    v_writelane_b32 v18, s51, 9
+; SI-NEXT:    v_writelane_b32 v18, s52, 10
+; SI-NEXT:    v_writelane_b32 v18, s53, 11
+; SI-NEXT:    v_writelane_b32 v18, s54, 12
+; SI-NEXT:    v_writelane_b32 v18, s55, 13
+; SI-NEXT:    v_writelane_b32 v18, s30, 14
 ; SI-NEXT:    v_readfirstlane_b32 s53, v3
 ; SI-NEXT:    v_readfirstlane_b32 s55, v2
 ; SI-NEXT:    v_readfirstlane_b32 s50, v1
 ; SI-NEXT:    v_readfirstlane_b32 s52, v0
+; SI-NEXT:    v_writelane_b32 v18, s31, 15
 ; SI-NEXT:    s_lshr_b32 s30, s29, 16
 ; SI-NEXT:    s_lshr_b32 s49, s28, 16
 ; SI-NEXT:    s_lshr_b32 s95, s27, 16
@@ -28332,6 +28332,7 @@ define inreg <36 x half> @bitcast_v36i16_to_v36f16_scalar(<36 x i16> inreg %a, i
 ; SI-NEXT:    s_and_b32 s21, s43, 0xffff
 ; SI-NEXT:    s_lshl_b32 s22, s34, 16
 ; SI-NEXT:    s_or_b32 s21, s21, s22
+; SI-NEXT:    v_readlane_b32 s30, v18, 14
 ; SI-NEXT:    v_mov_b32_e32 v0, s16
 ; SI-NEXT:    v_mov_b32_e32 v1, s17
 ; SI-NEXT:    v_mov_b32_e32 v2, s14
@@ -28350,22 +28351,21 @@ define inreg <36 x half> @bitcast_v36i16_to_v36f16_scalar(<36 x i16> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v15, s19
 ; SI-NEXT:    v_mov_b32_e32 v16, s20
 ; SI-NEXT:    v_mov_b32_e32 v17, s21
-; SI-NEXT:    v_readlane_b32 s55, v18, 15
-; SI-NEXT:    v_readlane_b32 s54, v18, 14
-; SI-NEXT:    v_readlane_b32 s53, v18, 13
-; SI-NEXT:    v_readlane_b32 s52, v18, 12
-; SI-NEXT:    v_readlane_b32 s51, v18, 11
-; SI-NEXT:    v_readlane_b32 s50, v18, 10
-; SI-NEXT:    v_readlane_b32 s49, v18, 9
-; SI-NEXT:    v_readlane_b32 s48, v18, 8
-; SI-NEXT:    v_readlane_b32 s39, v18, 7
-; SI-NEXT:    v_readlane_b32 s38, v18, 6
-; SI-NEXT:    v_readlane_b32 s37, v18, 5
-; SI-NEXT:    v_readlane_b32 s36, v18, 4
-; SI-NEXT:    v_readlane_b32 s35, v18, 3
-; SI-NEXT:    v_readlane_b32 s34, v18, 2
-; SI-NEXT:    v_readlane_b32 s31, v18, 1
-; SI-NEXT:    v_readlane_b32 s30, v18, 0
+; SI-NEXT:    v_readlane_b32 s31, v18, 15
+; SI-NEXT:    v_readlane_b32 s55, v18, 13
+; SI-NEXT:    v_readlane_b32 s54, v18, 12
+; SI-NEXT:    v_readlane_b32 s53, v18, 11
+; SI-NEXT:    v_readlane_b32 s52, v18, 10
+; SI-NEXT:    v_readlane_b32 s51, v18, 9
+; SI-NEXT:    v_readlane_b32 s50, v18, 8
+; SI-NEXT:    v_readlane_b32 s49, v18, 7
+; SI-NEXT:    v_readlane_b32 s48, v18, 6
+; SI-NEXT:    v_readlane_b32 s39, v18, 5
+; SI-NEXT:    v_readlane_b32 s38, v18, 4
+; SI-NEXT:    v_readlane_b32 s37, v18, 3
+; SI-NEXT:    v_readlane_b32 s36, v18, 2
+; SI-NEXT:    v_readlane_b32 s35, v18, 1
+; SI-NEXT:    v_readlane_b32 s34, v18, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -28988,7 +28988,7 @@ end:
   ret <36 x half> %phi
 }
 
-define <36 x i16> @bitcast_v36f16_to_v36i16(<36 x half> %a, i32 %b) {
+define <36 x i16> @bitcast_v36f16_to_v36i16(<36 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v36f16_to_v36i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -29588,7 +29588,7 @@ end:
   ret <36 x i16> %phi
 }
 
-define inreg <36 x i16> @bitcast_v36f16_to_v36i16_scalar(<36 x half> inreg %a, i32 inreg %b) {
+define inreg <36 x i16> @bitcast_v36f16_to_v36i16_scalar(<36 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v36f16_to_v36i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -30487,3 +30487,5 @@ end:
   %phi = phi <36 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
   ret <36 x i16> %phi
 }
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll
index 666b5353465eb..b59e14fe0da33 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll
@@ -6,7 +6,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
-define <20 x float> @bitcast_v20i32_to_v20f32(<20 x i32> %a, i32 %b) {
+define <20 x float> @bitcast_v20i32_to_v20f32(<20 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v20i32_to_v20f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -156,7 +156,7 @@ end:
   ret <20 x float> %phi
 }
 
-define inreg <20 x float> @bitcast_v20i32_to_v20f32_scalar(<20 x i32> inreg %a, i32 inreg %b) {
+define inreg <20 x float> @bitcast_v20i32_to_v20f32_scalar(<20 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v20i32_to_v20f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -402,7 +402,7 @@ end:
   ret <20 x float> %phi
 }
 
-define <20 x i32> @bitcast_v20f32_to_v20i32(<20 x float> %a, i32 %b) {
+define <20 x i32> @bitcast_v20f32_to_v20i32(<20 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v20f32_to_v20i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -542,7 +542,7 @@ end:
   ret <20 x i32> %phi
 }
 
-define inreg <20 x i32> @bitcast_v20f32_to_v20i32_scalar(<20 x float> inreg %a, i32 inreg %b) {
+define inreg <20 x i32> @bitcast_v20f32_to_v20i32_scalar(<20 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v20f32_to_v20i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1022,7 +1022,7 @@ end:
   ret <20 x i32> %phi
 }
 
-define <10 x i64> @bitcast_v20i32_to_v10i64(<20 x i32> %a, i32 %b) {
+define <10 x i64> @bitcast_v20i32_to_v10i64(<20 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v20i32_to_v10i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1172,7 +1172,7 @@ end:
   ret <10 x i64> %phi
 }
 
-define inreg <10 x i64> @bitcast_v20i32_to_v10i64_scalar(<20 x i32> inreg %a, i32 inreg %b) {
+define inreg <10 x i64> @bitcast_v20i32_to_v10i64_scalar(<20 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v20i32_to_v10i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1418,7 +1418,7 @@ end:
   ret <10 x i64> %phi
 }
 
-define <20 x i32> @bitcast_v10i64_to_v20i32(<10 x i64> %a, i32 %b) {
+define <20 x i32> @bitcast_v10i64_to_v20i32(<10 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v10i64_to_v20i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1573,7 +1573,7 @@ end:
   ret <20 x i32> %phi
 }
 
-define inreg <20 x i32> @bitcast_v10i64_to_v20i32_scalar(<10 x i64> inreg %a, i32 inreg %b) {
+define inreg <20 x i32> @bitcast_v10i64_to_v20i32_scalar(<10 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v10i64_to_v20i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1819,7 +1819,7 @@ end:
   ret <20 x i32> %phi
 }
 
-define <10 x double> @bitcast_v20i32_to_v10f64(<20 x i32> %a, i32 %b) {
+define <10 x double> @bitcast_v20i32_to_v10f64(<20 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v20i32_to_v10f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1969,7 +1969,7 @@ end:
   ret <10 x double> %phi
 }
 
-define inreg <10 x double> @bitcast_v20i32_to_v10f64_scalar(<20 x i32> inreg %a, i32 inreg %b) {
+define inreg <10 x double> @bitcast_v20i32_to_v10f64_scalar(<20 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v20i32_to_v10f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2215,7 +2215,7 @@ end:
   ret <10 x double> %phi
 }
 
-define <20 x i32> @bitcast_v10f64_to_v20i32(<10 x double> %a, i32 %b) {
+define <20 x i32> @bitcast_v10f64_to_v20i32(<10 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v10f64_to_v20i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2325,7 +2325,7 @@ end:
   ret <20 x i32> %phi
 }
 
-define inreg <20 x i32> @bitcast_v10f64_to_v20i32_scalar(<10 x double> inreg %a, i32 inreg %b) {
+define inreg <20 x i32> @bitcast_v10f64_to_v20i32_scalar(<10 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v10f64_to_v20i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2765,7 +2765,7 @@ end:
   ret <20 x i32> %phi
 }
 
-define <40 x i16> @bitcast_v20i32_to_v40i16(<20 x i32> %a, i32 %b) {
+define <40 x i16> @bitcast_v20i32_to_v40i16(<20 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v20i32_to_v40i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3343,7 +3343,7 @@ end:
   ret <40 x i16> %phi
 }
 
-define inreg <40 x i16> @bitcast_v20i32_to_v40i16_scalar(<20 x i32> inreg %a, i32 inreg %b) {
+define inreg <40 x i16> @bitcast_v20i32_to_v40i16_scalar(<20 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v20i32_to_v40i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3989,7 +3989,7 @@ end:
   ret <40 x i16> %phi
 }
 
-define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) {
+define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v40i16_to_v20i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4837,7 +4837,7 @@ end:
   ret <20 x i32> %phi
 }
 
-define inreg <20 x i32> @bitcast_v40i16_to_v20i32_scalar(<40 x i16> inreg %a, i32 inreg %b) {
+define inreg <20 x i32> @bitcast_v40i16_to_v20i32_scalar(<40 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v40i16_to_v20i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5105,30 +5105,32 @@ define inreg <20 x i32> @bitcast_v40i16_to_v20i32_scalar(<40 x i16> inreg %a, i3
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v20, s30, 0
-; VI-NEXT:    v_writelane_b32 v20, s31, 1
-; VI-NEXT:    v_writelane_b32 v20, s34, 2
-; VI-NEXT:    v_writelane_b32 v20, s35, 3
-; VI-NEXT:    v_writelane_b32 v20, s36, 4
-; VI-NEXT:    v_writelane_b32 v20, s37, 5
-; VI-NEXT:    v_writelane_b32 v20, s38, 6
-; VI-NEXT:    v_writelane_b32 v20, s39, 7
-; VI-NEXT:    v_writelane_b32 v20, s48, 8
-; VI-NEXT:    v_writelane_b32 v20, s49, 9
-; VI-NEXT:    v_writelane_b32 v20, s50, 10
-; VI-NEXT:    v_writelane_b32 v20, s51, 11
-; VI-NEXT:    v_writelane_b32 v20, s52, 12
-; VI-NEXT:    v_writelane_b32 v20, s53, 13
-; VI-NEXT:    v_writelane_b32 v20, s54, 14
-; VI-NEXT:    v_writelane_b32 v20, s55, 15
-; VI-NEXT:    v_writelane_b32 v20, s64, 16
+; VI-NEXT:    v_writelane_b32 v20, s34, 0
+; VI-NEXT:    v_writelane_b32 v20, s35, 1
+; VI-NEXT:    v_writelane_b32 v20, s36, 2
+; VI-NEXT:    v_writelane_b32 v20, s37, 3
+; VI-NEXT:    v_writelane_b32 v20, s38, 4
+; VI-NEXT:    v_writelane_b32 v20, s39, 5
+; VI-NEXT:    v_writelane_b32 v20, s48, 6
+; VI-NEXT:    v_writelane_b32 v20, s49, 7
+; VI-NEXT:    v_writelane_b32 v20, s50, 8
+; VI-NEXT:    v_writelane_b32 v20, s51, 9
+; VI-NEXT:    v_writelane_b32 v20, s52, 10
+; VI-NEXT:    v_writelane_b32 v20, s53, 11
+; VI-NEXT:    v_writelane_b32 v20, s54, 12
+; VI-NEXT:    v_writelane_b32 v20, s55, 13
+; VI-NEXT:    v_writelane_b32 v20, s64, 14
+; VI-NEXT:    v_writelane_b32 v20, s65, 15
+; VI-NEXT:    v_writelane_b32 v20, s66, 16
+; VI-NEXT:    v_writelane_b32 v20, s67, 17
+; VI-NEXT:    v_writelane_b32 v20, s30, 18
 ; VI-NEXT:    v_readfirstlane_b32 s7, v5
 ; VI-NEXT:    v_readfirstlane_b32 s9, v4
 ; VI-NEXT:    v_readfirstlane_b32 s12, v3
 ; VI-NEXT:    v_readfirstlane_b32 s15, v2
 ; VI-NEXT:    v_readfirstlane_b32 s74, v1
 ; VI-NEXT:    v_readfirstlane_b32 s77, v0
-; VI-NEXT:    v_writelane_b32 v20, s65, 17
+; VI-NEXT:    v_writelane_b32 v20, s31, 19
 ; VI-NEXT:    s_lshr_b32 s11, s29, 16
 ; VI-NEXT:    s_lshr_b32 s13, s28, 16
 ; VI-NEXT:    s_lshr_b32 s72, s27, 16
@@ -5150,9 +5152,7 @@ define inreg <20 x i32> @bitcast_v40i16_to_v20i32_scalar(<40 x i16> inreg %a, i3
 ; VI-NEXT:    s_lshr_b32 s73, s74, 16
 ; VI-NEXT:    s_lshr_b32 s76, s77, 16
 ; VI-NEXT:    v_readfirstlane_b32 s4, v6
-; VI-NEXT:    v_writelane_b32 v20, s66, 18
 ; VI-NEXT:    s_cmp_lg_u32 s4, 0
-; VI-NEXT:    v_writelane_b32 v20, s67, 19
 ; VI-NEXT:    s_cbranch_scc0 .LBB15_4
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    s_and_b32 s4, 0xffff, s16
@@ -5318,6 +5318,7 @@ define inreg <20 x i32> @bitcast_v40i16_to_v20i32_scalar(<40 x i16> inreg %a, i3
 ; VI-NEXT:    s_or_b32 s4, s5, s4
 ; VI-NEXT:    s_add_i32 s55, s4, 0x30000
 ; VI-NEXT:  .LBB15_3: ; %end
+; VI-NEXT:    v_readlane_b32 s30, v20, 18
 ; VI-NEXT:    v_mov_b32_e32 v0, s36
 ; VI-NEXT:    v_mov_b32_e32 v1, s37
 ; VI-NEXT:    v_mov_b32_e32 v2, s38
@@ -5338,26 +5339,25 @@ define inreg <20 x i32> @bitcast_v40i16_to_v20i32_scalar(<40 x i16> inreg %a, i3
 ; VI-NEXT:    v_mov_b32_e32 v17, s53
 ; VI-NEXT:    v_mov_b32_e32 v18, s54
 ; VI-NEXT:    v_mov_b32_e32 v19, s55
-; VI-NEXT:    v_readlane_b32 s67, v20, 19
-; VI-NEXT:    v_readlane_b32 s66, v20, 18
-; VI-NEXT:    v_readlane_b32 s65, v20, 17
-; VI-NEXT:    v_readlane_b32 s64, v20, 16
-; VI-NEXT:    v_readlane_b32 s55, v20, 15
-; VI-NEXT:    v_readlane_b32 s54, v20, 14
-; VI-NEXT:    v_readlane_b32 s53, v20, 13
-; VI-NEXT:    v_readlane_b32 s52, v20, 12
-; VI-NEXT:    v_readlane_b32 s51, v20, 11
-; VI-NEXT:    v_readlane_b32 s50, v20, 10
-; VI-NEXT:    v_readlane_b32 s49, v20, 9
-; VI-NEXT:    v_readlane_b32 s48, v20, 8
-; VI-NEXT:    v_readlane_b32 s39, v20, 7
-; VI-NEXT:    v_readlane_b32 s38, v20, 6
-; VI-NEXT:    v_readlane_b32 s37, v20, 5
-; VI-NEXT:    v_readlane_b32 s36, v20, 4
-; VI-NEXT:    v_readlane_b32 s35, v20, 3
-; VI-NEXT:    v_readlane_b32 s34, v20, 2
-; VI-NEXT:    v_readlane_b32 s31, v20, 1
-; VI-NEXT:    v_readlane_b32 s30, v20, 0
+; VI-NEXT:    v_readlane_b32 s31, v20, 19
+; VI-NEXT:    v_readlane_b32 s67, v20, 17
+; VI-NEXT:    v_readlane_b32 s66, v20, 16
+; VI-NEXT:    v_readlane_b32 s65, v20, 15
+; VI-NEXT:    v_readlane_b32 s64, v20, 14
+; VI-NEXT:    v_readlane_b32 s55, v20, 13
+; VI-NEXT:    v_readlane_b32 s54, v20, 12
+; VI-NEXT:    v_readlane_b32 s53, v20, 11
+; VI-NEXT:    v_readlane_b32 s52, v20, 10
+; VI-NEXT:    v_readlane_b32 s51, v20, 9
+; VI-NEXT:    v_readlane_b32 s50, v20, 8
+; VI-NEXT:    v_readlane_b32 s49, v20, 7
+; VI-NEXT:    v_readlane_b32 s48, v20, 6
+; VI-NEXT:    v_readlane_b32 s39, v20, 5
+; VI-NEXT:    v_readlane_b32 s38, v20, 4
+; VI-NEXT:    v_readlane_b32 s37, v20, 3
+; VI-NEXT:    v_readlane_b32 s36, v20, 2
+; VI-NEXT:    v_readlane_b32 s35, v20, 1
+; VI-NEXT:    v_readlane_b32 s34, v20, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -5623,7 +5623,7 @@ end:
   ret <20 x i32> %phi
 }
 
-define <40 x half> @bitcast_v20i32_to_v40f16(<20 x i32> %a, i32 %b) {
+define <40 x half> @bitcast_v20i32_to_v40f16(<20 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v20i32_to_v40f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6201,7 +6201,7 @@ end:
   ret <40 x half> %phi
 }
 
-define inreg <40 x half> @bitcast_v20i32_to_v40f16_scalar(<20 x i32> inreg %a, i32 inreg %b) {
+define inreg <40 x half> @bitcast_v20i32_to_v40f16_scalar(<20 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v20i32_to_v40f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6847,7 +6847,7 @@ end:
   ret <40 x half> %phi
 }
 
-define <20 x i32> @bitcast_v40f16_to_v20i32(<40 x half> %a, i32 %b) {
+define <20 x i32> @bitcast_v40f16_to_v20i32(<40 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v40f16_to_v20i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7780,7 +7780,7 @@ end:
   ret <20 x i32> %phi
 }
 
-define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i32 inreg %b) {
+define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v40f16_to_v20i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8122,30 +8122,32 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v32, s30, 0
-; VI-NEXT:    v_writelane_b32 v32, s31, 1
-; VI-NEXT:    v_writelane_b32 v32, s34, 2
-; VI-NEXT:    v_writelane_b32 v32, s35, 3
-; VI-NEXT:    v_writelane_b32 v32, s36, 4
-; VI-NEXT:    v_writelane_b32 v32, s37, 5
-; VI-NEXT:    v_writelane_b32 v32, s38, 6
-; VI-NEXT:    v_writelane_b32 v32, s39, 7
-; VI-NEXT:    v_writelane_b32 v32, s48, 8
-; VI-NEXT:    v_writelane_b32 v32, s49, 9
-; VI-NEXT:    v_writelane_b32 v32, s50, 10
-; VI-NEXT:    v_writelane_b32 v32, s51, 11
-; VI-NEXT:    v_writelane_b32 v32, s52, 12
-; VI-NEXT:    v_writelane_b32 v32, s53, 13
-; VI-NEXT:    v_writelane_b32 v32, s54, 14
-; VI-NEXT:    v_writelane_b32 v32, s55, 15
-; VI-NEXT:    v_writelane_b32 v32, s64, 16
+; VI-NEXT:    v_writelane_b32 v32, s34, 0
+; VI-NEXT:    v_writelane_b32 v32, s35, 1
+; VI-NEXT:    v_writelane_b32 v32, s36, 2
+; VI-NEXT:    v_writelane_b32 v32, s37, 3
+; VI-NEXT:    v_writelane_b32 v32, s38, 4
+; VI-NEXT:    v_writelane_b32 v32, s39, 5
+; VI-NEXT:    v_writelane_b32 v32, s48, 6
+; VI-NEXT:    v_writelane_b32 v32, s49, 7
+; VI-NEXT:    v_writelane_b32 v32, s50, 8
+; VI-NEXT:    v_writelane_b32 v32, s51, 9
+; VI-NEXT:    v_writelane_b32 v32, s52, 10
+; VI-NEXT:    v_writelane_b32 v32, s53, 11
+; VI-NEXT:    v_writelane_b32 v32, s54, 12
+; VI-NEXT:    v_writelane_b32 v32, s55, 13
+; VI-NEXT:    v_writelane_b32 v32, s64, 14
+; VI-NEXT:    v_writelane_b32 v32, s65, 15
+; VI-NEXT:    v_writelane_b32 v32, s66, 16
+; VI-NEXT:    v_writelane_b32 v32, s67, 17
+; VI-NEXT:    v_writelane_b32 v32, s30, 18
 ; VI-NEXT:    v_readfirstlane_b32 s6, v5
 ; VI-NEXT:    v_readfirstlane_b32 s8, v4
 ; VI-NEXT:    v_readfirstlane_b32 s11, v3
 ; VI-NEXT:    v_readfirstlane_b32 s14, v2
 ; VI-NEXT:    v_readfirstlane_b32 s73, v1
 ; VI-NEXT:    v_readfirstlane_b32 s76, v0
-; VI-NEXT:    v_writelane_b32 v32, s65, 17
+; VI-NEXT:    v_writelane_b32 v32, s31, 19
 ; VI-NEXT:    s_lshr_b32 s10, s29, 16
 ; VI-NEXT:    s_lshr_b32 s13, s28, 16
 ; VI-NEXT:    s_lshr_b32 s72, s27, 16
@@ -8167,9 +8169,7 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i
 ; VI-NEXT:    s_lshr_b32 s75, s73, 16
 ; VI-NEXT:    s_lshr_b32 s78, s76, 16
 ; VI-NEXT:    v_readfirstlane_b32 s4, v6
-; VI-NEXT:    v_writelane_b32 v32, s66, 18
 ; VI-NEXT:    s_cmp_lg_u32 s4, 0
-; VI-NEXT:    v_writelane_b32 v32, s67, 19
 ; VI-NEXT:    s_cbranch_scc0 .LBB19_3
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    s_and_b32 s4, 0xffff, s16
@@ -8353,26 +8353,26 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i
 ; VI-NEXT:    v_mov_b32_e32 v30, s66
 ; VI-NEXT:    v_mov_b32_e32 v31, s67
 ; VI-NEXT:  .LBB19_5: ; %end
-; VI-NEXT:    v_readlane_b32 s67, v32, 19
-; VI-NEXT:    v_readlane_b32 s66, v32, 18
-; VI-NEXT:    v_readlane_b32 s65, v32, 17
-; VI-NEXT:    v_readlane_b32 s64, v32, 16
-; VI-NEXT:    v_readlane_b32 s55, v32, 15
-; VI-NEXT:    v_readlane_b32 s54, v32, 14
-; VI-NEXT:    v_readlane_b32 s53, v32, 13
-; VI-NEXT:    v_readlane_b32 s52, v32, 12
-; VI-NEXT:    v_readlane_b32 s51, v32, 11
-; VI-NEXT:    v_readlane_b32 s50, v32, 10
-; VI-NEXT:    v_readlane_b32 s49, v32, 9
-; VI-NEXT:    v_readlane_b32 s48, v32, 8
-; VI-NEXT:    v_readlane_b32 s39, v32, 7
-; VI-NEXT:    v_readlane_b32 s38, v32, 6
-; VI-NEXT:    v_readlane_b32 s37, v32, 5
-; VI-NEXT:    v_readlane_b32 s36, v32, 4
-; VI-NEXT:    v_readlane_b32 s35, v32, 3
-; VI-NEXT:    v_readlane_b32 s34, v32, 2
-; VI-NEXT:    v_readlane_b32 s31, v32, 1
-; VI-NEXT:    v_readlane_b32 s30, v32, 0
+; VI-NEXT:    v_readlane_b32 s30, v32, 18
+; VI-NEXT:    v_readlane_b32 s31, v32, 19
+; VI-NEXT:    v_readlane_b32 s67, v32, 17
+; VI-NEXT:    v_readlane_b32 s66, v32, 16
+; VI-NEXT:    v_readlane_b32 s65, v32, 15
+; VI-NEXT:    v_readlane_b32 s64, v32, 14
+; VI-NEXT:    v_readlane_b32 s55, v32, 13
+; VI-NEXT:    v_readlane_b32 s54, v32, 12
+; VI-NEXT:    v_readlane_b32 s53, v32, 11
+; VI-NEXT:    v_readlane_b32 s52, v32, 10
+; VI-NEXT:    v_readlane_b32 s51, v32, 9
+; VI-NEXT:    v_readlane_b32 s50, v32, 8
+; VI-NEXT:    v_readlane_b32 s49, v32, 7
+; VI-NEXT:    v_readlane_b32 s48, v32, 6
+; VI-NEXT:    v_readlane_b32 s39, v32, 5
+; VI-NEXT:    v_readlane_b32 s38, v32, 4
+; VI-NEXT:    v_readlane_b32 s37, v32, 3
+; VI-NEXT:    v_readlane_b32 s36, v32, 2
+; VI-NEXT:    v_readlane_b32 s35, v32, 1
+; VI-NEXT:    v_readlane_b32 s34, v32, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -8636,7 +8636,7 @@ end:
   ret <20 x i32> %phi
 }
 
-define <10 x i64> @bitcast_v20f32_to_v10i64(<20 x float> %a, i32 %b) {
+define <10 x i64> @bitcast_v20f32_to_v10i64(<20 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v20f32_to_v10i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8776,7 +8776,7 @@ end:
   ret <10 x i64> %phi
 }
 
-define inreg <10 x i64> @bitcast_v20f32_to_v10i64_scalar(<20 x float> inreg %a, i32 inreg %b) {
+define inreg <10 x i64> @bitcast_v20f32_to_v10i64_scalar(<20 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v20f32_to_v10i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9256,7 +9256,7 @@ end:
   ret <10 x i64> %phi
 }
 
-define <20 x float> @bitcast_v10i64_to_v20f32(<10 x i64> %a, i32 %b) {
+define <20 x float> @bitcast_v10i64_to_v20f32(<10 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v10i64_to_v20f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9411,7 +9411,7 @@ end:
   ret <20 x float> %phi
 }
 
-define inreg <20 x float> @bitcast_v10i64_to_v20f32_scalar(<10 x i64> inreg %a, i32 inreg %b) {
+define inreg <20 x float> @bitcast_v10i64_to_v20f32_scalar(<10 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v10i64_to_v20f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9657,7 +9657,7 @@ end:
   ret <20 x float> %phi
 }
 
-define <10 x double> @bitcast_v20f32_to_v10f64(<20 x float> %a, i32 %b) {
+define <10 x double> @bitcast_v20f32_to_v10f64(<20 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v20f32_to_v10f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9797,7 +9797,7 @@ end:
   ret <10 x double> %phi
 }
 
-define inreg <10 x double> @bitcast_v20f32_to_v10f64_scalar(<20 x float> inreg %a, i32 inreg %b) {
+define inreg <10 x double> @bitcast_v20f32_to_v10f64_scalar(<20 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v20f32_to_v10f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10277,7 +10277,7 @@ end:
   ret <10 x double> %phi
 }
 
-define <20 x float> @bitcast_v10f64_to_v20f32(<10 x double> %a, i32 %b) {
+define <20 x float> @bitcast_v10f64_to_v20f32(<10 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v10f64_to_v20f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10387,7 +10387,7 @@ end:
   ret <20 x float> %phi
 }
 
-define inreg <20 x float> @bitcast_v10f64_to_v20f32_scalar(<10 x double> inreg %a, i32 inreg %b) {
+define inreg <20 x float> @bitcast_v10f64_to_v20f32_scalar(<10 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v10f64_to_v20f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10827,7 +10827,7 @@ end:
   ret <20 x float> %phi
 }
 
-define <40 x i16> @bitcast_v20f32_to_v40i16(<20 x float> %a, i32 %b) {
+define <40 x i16> @bitcast_v20f32_to_v40i16(<20 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v20f32_to_v40i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11385,7 +11385,7 @@ end:
   ret <40 x i16> %phi
 }
 
-define inreg <40 x i16> @bitcast_v20f32_to_v40i16_scalar(<20 x float> inreg %a, i32 inreg %b) {
+define inreg <40 x i16> @bitcast_v20f32_to_v40i16_scalar(<20 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v20f32_to_v40i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12269,7 +12269,7 @@ end:
   ret <40 x i16> %phi
 }
 
-define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) {
+define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v40i16_to_v20f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13117,7 +13117,7 @@ end:
   ret <20 x float> %phi
 }
 
-define inreg <20 x float> @bitcast_v40i16_to_v20f32_scalar(<40 x i16> inreg %a, i32 inreg %b) {
+define inreg <20 x float> @bitcast_v40i16_to_v20f32_scalar(<40 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v40i16_to_v20f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13385,30 +13385,32 @@ define inreg <20 x float> @bitcast_v40i16_to_v20f32_scalar(<40 x i16> inreg %a,
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v20, s30, 0
-; VI-NEXT:    v_writelane_b32 v20, s31, 1
-; VI-NEXT:    v_writelane_b32 v20, s34, 2
-; VI-NEXT:    v_writelane_b32 v20, s35, 3
-; VI-NEXT:    v_writelane_b32 v20, s36, 4
-; VI-NEXT:    v_writelane_b32 v20, s37, 5
-; VI-NEXT:    v_writelane_b32 v20, s38, 6
-; VI-NEXT:    v_writelane_b32 v20, s39, 7
-; VI-NEXT:    v_writelane_b32 v20, s48, 8
-; VI-NEXT:    v_writelane_b32 v20, s49, 9
-; VI-NEXT:    v_writelane_b32 v20, s50, 10
-; VI-NEXT:    v_writelane_b32 v20, s51, 11
-; VI-NEXT:    v_writelane_b32 v20, s52, 12
-; VI-NEXT:    v_writelane_b32 v20, s53, 13
-; VI-NEXT:    v_writelane_b32 v20, s54, 14
-; VI-NEXT:    v_writelane_b32 v20, s55, 15
-; VI-NEXT:    v_writelane_b32 v20, s64, 16
+; VI-NEXT:    v_writelane_b32 v20, s34, 0
+; VI-NEXT:    v_writelane_b32 v20, s35, 1
+; VI-NEXT:    v_writelane_b32 v20, s36, 2
+; VI-NEXT:    v_writelane_b32 v20, s37, 3
+; VI-NEXT:    v_writelane_b32 v20, s38, 4
+; VI-NEXT:    v_writelane_b32 v20, s39, 5
+; VI-NEXT:    v_writelane_b32 v20, s48, 6
+; VI-NEXT:    v_writelane_b32 v20, s49, 7
+; VI-NEXT:    v_writelane_b32 v20, s50, 8
+; VI-NEXT:    v_writelane_b32 v20, s51, 9
+; VI-NEXT:    v_writelane_b32 v20, s52, 10
+; VI-NEXT:    v_writelane_b32 v20, s53, 11
+; VI-NEXT:    v_writelane_b32 v20, s54, 12
+; VI-NEXT:    v_writelane_b32 v20, s55, 13
+; VI-NEXT:    v_writelane_b32 v20, s64, 14
+; VI-NEXT:    v_writelane_b32 v20, s65, 15
+; VI-NEXT:    v_writelane_b32 v20, s66, 16
+; VI-NEXT:    v_writelane_b32 v20, s67, 17
+; VI-NEXT:    v_writelane_b32 v20, s30, 18
 ; VI-NEXT:    v_readfirstlane_b32 s7, v5
 ; VI-NEXT:    v_readfirstlane_b32 s9, v4
 ; VI-NEXT:    v_readfirstlane_b32 s12, v3
 ; VI-NEXT:    v_readfirstlane_b32 s15, v2
 ; VI-NEXT:    v_readfirstlane_b32 s74, v1
 ; VI-NEXT:    v_readfirstlane_b32 s77, v0
-; VI-NEXT:    v_writelane_b32 v20, s65, 17
+; VI-NEXT:    v_writelane_b32 v20, s31, 19
 ; VI-NEXT:    s_lshr_b32 s11, s29, 16
 ; VI-NEXT:    s_lshr_b32 s13, s28, 16
 ; VI-NEXT:    s_lshr_b32 s72, s27, 16
@@ -13430,9 +13432,7 @@ define inreg <20 x float> @bitcast_v40i16_to_v20f32_scalar(<40 x i16> inreg %a,
 ; VI-NEXT:    s_lshr_b32 s73, s74, 16
 ; VI-NEXT:    s_lshr_b32 s76, s77, 16
 ; VI-NEXT:    v_readfirstlane_b32 s4, v6
-; VI-NEXT:    v_writelane_b32 v20, s66, 18
 ; VI-NEXT:    s_cmp_lg_u32 s4, 0
-; VI-NEXT:    v_writelane_b32 v20, s67, 19
 ; VI-NEXT:    s_cbranch_scc0 .LBB31_4
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    s_and_b32 s4, 0xffff, s16
@@ -13598,6 +13598,7 @@ define inreg <20 x float> @bitcast_v40i16_to_v20f32_scalar(<40 x i16> inreg %a,
 ; VI-NEXT:    s_or_b32 s4, s5, s4
 ; VI-NEXT:    s_add_i32 s55, s4, 0x30000
 ; VI-NEXT:  .LBB31_3: ; %end
+; VI-NEXT:    v_readlane_b32 s30, v20, 18
 ; VI-NEXT:    v_mov_b32_e32 v0, s36
 ; VI-NEXT:    v_mov_b32_e32 v1, s37
 ; VI-NEXT:    v_mov_b32_e32 v2, s38
@@ -13618,26 +13619,25 @@ define inreg <20 x float> @bitcast_v40i16_to_v20f32_scalar(<40 x i16> inreg %a,
 ; VI-NEXT:    v_mov_b32_e32 v17, s53
 ; VI-NEXT:    v_mov_b32_e32 v18, s54
 ; VI-NEXT:    v_mov_b32_e32 v19, s55
-; VI-NEXT:    v_readlane_b32 s67, v20, 19
-; VI-NEXT:    v_readlane_b32 s66, v20, 18
-; VI-NEXT:    v_readlane_b32 s65, v20, 17
-; VI-NEXT:    v_readlane_b32 s64, v20, 16
-; VI-NEXT:    v_readlane_b32 s55, v20, 15
-; VI-NEXT:    v_readlane_b32 s54, v20, 14
-; VI-NEXT:    v_readlane_b32 s53, v20, 13
-; VI-NEXT:    v_readlane_b32 s52, v20, 12
-; VI-NEXT:    v_readlane_b32 s51, v20, 11
-; VI-NEXT:    v_readlane_b32 s50, v20, 10
-; VI-NEXT:    v_readlane_b32 s49, v20, 9
-; VI-NEXT:    v_readlane_b32 s48, v20, 8
-; VI-NEXT:    v_readlane_b32 s39, v20, 7
-; VI-NEXT:    v_readlane_b32 s38, v20, 6
-; VI-NEXT:    v_readlane_b32 s37, v20, 5
-; VI-NEXT:    v_readlane_b32 s36, v20, 4
-; VI-NEXT:    v_readlane_b32 s35, v20, 3
-; VI-NEXT:    v_readlane_b32 s34, v20, 2
-; VI-NEXT:    v_readlane_b32 s31, v20, 1
-; VI-NEXT:    v_readlane_b32 s30, v20, 0
+; VI-NEXT:    v_readlane_b32 s31, v20, 19
+; VI-NEXT:    v_readlane_b32 s67, v20, 17
+; VI-NEXT:    v_readlane_b32 s66, v20, 16
+; VI-NEXT:    v_readlane_b32 s65, v20, 15
+; VI-NEXT:    v_readlane_b32 s64, v20, 14
+; VI-NEXT:    v_readlane_b32 s55, v20, 13
+; VI-NEXT:    v_readlane_b32 s54, v20, 12
+; VI-NEXT:    v_readlane_b32 s53, v20, 11
+; VI-NEXT:    v_readlane_b32 s52, v20, 10
+; VI-NEXT:    v_readlane_b32 s51, v20, 9
+; VI-NEXT:    v_readlane_b32 s50, v20, 8
+; VI-NEXT:    v_readlane_b32 s49, v20, 7
+; VI-NEXT:    v_readlane_b32 s48, v20, 6
+; VI-NEXT:    v_readlane_b32 s39, v20, 5
+; VI-NEXT:    v_readlane_b32 s38, v20, 4
+; VI-NEXT:    v_readlane_b32 s37, v20, 3
+; VI-NEXT:    v_readlane_b32 s36, v20, 2
+; VI-NEXT:    v_readlane_b32 s35, v20, 1
+; VI-NEXT:    v_readlane_b32 s34, v20, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -13903,7 +13903,7 @@ end:
   ret <20 x float> %phi
 }
 
-define <40 x half> @bitcast_v20f32_to_v40f16(<20 x float> %a, i32 %b) {
+define <40 x half> @bitcast_v20f32_to_v40f16(<20 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v20f32_to_v40f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14461,7 +14461,7 @@ end:
   ret <40 x half> %phi
 }
 
-define inreg <40 x half> @bitcast_v20f32_to_v40f16_scalar(<20 x float> inreg %a, i32 inreg %b) {
+define inreg <40 x half> @bitcast_v20f32_to_v40f16_scalar(<20 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v20f32_to_v40f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15345,7 +15345,7 @@ end:
   ret <40 x half> %phi
 }
 
-define <20 x float> @bitcast_v40f16_to_v20f32(<40 x half> %a, i32 %b) {
+define <20 x float> @bitcast_v40f16_to_v20f32(<40 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v40f16_to_v20f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16278,7 +16278,7 @@ end:
   ret <20 x float> %phi
 }
 
-define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a, i32 inreg %b) {
+define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v40f16_to_v20f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16620,30 +16620,32 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a,
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v32, s30, 0
-; VI-NEXT:    v_writelane_b32 v32, s31, 1
-; VI-NEXT:    v_writelane_b32 v32, s34, 2
-; VI-NEXT:    v_writelane_b32 v32, s35, 3
-; VI-NEXT:    v_writelane_b32 v32, s36, 4
-; VI-NEXT:    v_writelane_b32 v32, s37, 5
-; VI-NEXT:    v_writelane_b32 v32, s38, 6
-; VI-NEXT:    v_writelane_b32 v32, s39, 7
-; VI-NEXT:    v_writelane_b32 v32, s48, 8
-; VI-NEXT:    v_writelane_b32 v32, s49, 9
-; VI-NEXT:    v_writelane_b32 v32, s50, 10
-; VI-NEXT:    v_writelane_b32 v32, s51, 11
-; VI-NEXT:    v_writelane_b32 v32, s52, 12
-; VI-NEXT:    v_writelane_b32 v32, s53, 13
-; VI-NEXT:    v_writelane_b32 v32, s54, 14
-; VI-NEXT:    v_writelane_b32 v32, s55, 15
-; VI-NEXT:    v_writelane_b32 v32, s64, 16
+; VI-NEXT:    v_writelane_b32 v32, s34, 0
+; VI-NEXT:    v_writelane_b32 v32, s35, 1
+; VI-NEXT:    v_writelane_b32 v32, s36, 2
+; VI-NEXT:    v_writelane_b32 v32, s37, 3
+; VI-NEXT:    v_writelane_b32 v32, s38, 4
+; VI-NEXT:    v_writelane_b32 v32, s39, 5
+; VI-NEXT:    v_writelane_b32 v32, s48, 6
+; VI-NEXT:    v_writelane_b32 v32, s49, 7
+; VI-NEXT:    v_writelane_b32 v32, s50, 8
+; VI-NEXT:    v_writelane_b32 v32, s51, 9
+; VI-NEXT:    v_writelane_b32 v32, s52, 10
+; VI-NEXT:    v_writelane_b32 v32, s53, 11
+; VI-NEXT:    v_writelane_b32 v32, s54, 12
+; VI-NEXT:    v_writelane_b32 v32, s55, 13
+; VI-NEXT:    v_writelane_b32 v32, s64, 14
+; VI-NEXT:    v_writelane_b32 v32, s65, 15
+; VI-NEXT:    v_writelane_b32 v32, s66, 16
+; VI-NEXT:    v_writelane_b32 v32, s67, 17
+; VI-NEXT:    v_writelane_b32 v32, s30, 18
 ; VI-NEXT:    v_readfirstlane_b32 s6, v5
 ; VI-NEXT:    v_readfirstlane_b32 s8, v4
 ; VI-NEXT:    v_readfirstlane_b32 s11, v3
 ; VI-NEXT:    v_readfirstlane_b32 s14, v2
 ; VI-NEXT:    v_readfirstlane_b32 s73, v1
 ; VI-NEXT:    v_readfirstlane_b32 s76, v0
-; VI-NEXT:    v_writelane_b32 v32, s65, 17
+; VI-NEXT:    v_writelane_b32 v32, s31, 19
 ; VI-NEXT:    s_lshr_b32 s10, s29, 16
 ; VI-NEXT:    s_lshr_b32 s13, s28, 16
 ; VI-NEXT:    s_lshr_b32 s72, s27, 16
@@ -16665,9 +16667,7 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a,
 ; VI-NEXT:    s_lshr_b32 s75, s73, 16
 ; VI-NEXT:    s_lshr_b32 s78, s76, 16
 ; VI-NEXT:    v_readfirstlane_b32 s4, v6
-; VI-NEXT:    v_writelane_b32 v32, s66, 18
 ; VI-NEXT:    s_cmp_lg_u32 s4, 0
-; VI-NEXT:    v_writelane_b32 v32, s67, 19
 ; VI-NEXT:    s_cbranch_scc0 .LBB35_3
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    s_and_b32 s4, 0xffff, s16
@@ -16851,26 +16851,26 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a,
 ; VI-NEXT:    v_mov_b32_e32 v30, s66
 ; VI-NEXT:    v_mov_b32_e32 v31, s67
 ; VI-NEXT:  .LBB35_5: ; %end
-; VI-NEXT:    v_readlane_b32 s67, v32, 19
-; VI-NEXT:    v_readlane_b32 s66, v32, 18
-; VI-NEXT:    v_readlane_b32 s65, v32, 17
-; VI-NEXT:    v_readlane_b32 s64, v32, 16
-; VI-NEXT:    v_readlane_b32 s55, v32, 15
-; VI-NEXT:    v_readlane_b32 s54, v32, 14
-; VI-NEXT:    v_readlane_b32 s53, v32, 13
-; VI-NEXT:    v_readlane_b32 s52, v32, 12
-; VI-NEXT:    v_readlane_b32 s51, v32, 11
-; VI-NEXT:    v_readlane_b32 s50, v32, 10
-; VI-NEXT:    v_readlane_b32 s49, v32, 9
-; VI-NEXT:    v_readlane_b32 s48, v32, 8
-; VI-NEXT:    v_readlane_b32 s39, v32, 7
-; VI-NEXT:    v_readlane_b32 s38, v32, 6
-; VI-NEXT:    v_readlane_b32 s37, v32, 5
-; VI-NEXT:    v_readlane_b32 s36, v32, 4
-; VI-NEXT:    v_readlane_b32 s35, v32, 3
-; VI-NEXT:    v_readlane_b32 s34, v32, 2
-; VI-NEXT:    v_readlane_b32 s31, v32, 1
-; VI-NEXT:    v_readlane_b32 s30, v32, 0
+; VI-NEXT:    v_readlane_b32 s30, v32, 18
+; VI-NEXT:    v_readlane_b32 s31, v32, 19
+; VI-NEXT:    v_readlane_b32 s67, v32, 17
+; VI-NEXT:    v_readlane_b32 s66, v32, 16
+; VI-NEXT:    v_readlane_b32 s65, v32, 15
+; VI-NEXT:    v_readlane_b32 s64, v32, 14
+; VI-NEXT:    v_readlane_b32 s55, v32, 13
+; VI-NEXT:    v_readlane_b32 s54, v32, 12
+; VI-NEXT:    v_readlane_b32 s53, v32, 11
+; VI-NEXT:    v_readlane_b32 s52, v32, 10
+; VI-NEXT:    v_readlane_b32 s51, v32, 9
+; VI-NEXT:    v_readlane_b32 s50, v32, 8
+; VI-NEXT:    v_readlane_b32 s49, v32, 7
+; VI-NEXT:    v_readlane_b32 s48, v32, 6
+; VI-NEXT:    v_readlane_b32 s39, v32, 5
+; VI-NEXT:    v_readlane_b32 s38, v32, 4
+; VI-NEXT:    v_readlane_b32 s37, v32, 3
+; VI-NEXT:    v_readlane_b32 s36, v32, 2
+; VI-NEXT:    v_readlane_b32 s35, v32, 1
+; VI-NEXT:    v_readlane_b32 s34, v32, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -17134,7 +17134,7 @@ end:
   ret <20 x float> %phi
 }
 
-define <10 x double> @bitcast_v10i64_to_v10f64(<10 x i64> %a, i32 %b) {
+define <10 x double> @bitcast_v10i64_to_v10f64(<10 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v10i64_to_v10f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17289,7 +17289,7 @@ end:
   ret <10 x double> %phi
 }
 
-define inreg <10 x double> @bitcast_v10i64_to_v10f64_scalar(<10 x i64> inreg %a, i32 inreg %b) {
+define inreg <10 x double> @bitcast_v10i64_to_v10f64_scalar(<10 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v10i64_to_v10f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17534,7 +17534,7 @@ end:
   ret <10 x double> %phi
 }
 
-define <10 x i64> @bitcast_v10f64_to_v10i64(<10 x double> %a, i32 %b) {
+define <10 x i64> @bitcast_v10f64_to_v10i64(<10 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v10f64_to_v10i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17644,7 +17644,7 @@ end:
   ret <10 x i64> %phi
 }
 
-define inreg <10 x i64> @bitcast_v10f64_to_v10i64_scalar(<10 x double> inreg %a, i32 inreg %b) {
+define inreg <10 x i64> @bitcast_v10f64_to_v10i64_scalar(<10 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v10f64_to_v10i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18084,7 +18084,7 @@ end:
   ret <10 x i64> %phi
 }
 
-define <40 x i16> @bitcast_v10i64_to_v40i16(<10 x i64> %a, i32 %b) {
+define <40 x i16> @bitcast_v10i64_to_v40i16(<10 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v10i64_to_v40i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18672,7 +18672,7 @@ end:
   ret <40 x i16> %phi
 }
 
-define inreg <40 x i16> @bitcast_v10i64_to_v40i16_scalar(<10 x i64> inreg %a, i32 inreg %b) {
+define inreg <40 x i16> @bitcast_v10i64_to_v40i16_scalar(<10 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v10i64_to_v40i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19318,7 +19318,7 @@ end:
   ret <40 x i16> %phi
 }
 
-define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) {
+define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v40i16_to_v10i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20166,7 +20166,7 @@ end:
   ret <10 x i64> %phi
 }
 
-define inreg <10 x i64> @bitcast_v40i16_to_v10i64_scalar(<40 x i16> inreg %a, i32 inreg %b) {
+define inreg <10 x i64> @bitcast_v40i16_to_v10i64_scalar(<40 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v40i16_to_v10i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20434,30 +20434,32 @@ define inreg <10 x i64> @bitcast_v40i16_to_v10i64_scalar(<40 x i16> inreg %a, i3
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v20, s30, 0
-; VI-NEXT:    v_writelane_b32 v20, s31, 1
-; VI-NEXT:    v_writelane_b32 v20, s34, 2
-; VI-NEXT:    v_writelane_b32 v20, s35, 3
-; VI-NEXT:    v_writelane_b32 v20, s36, 4
-; VI-NEXT:    v_writelane_b32 v20, s37, 5
-; VI-NEXT:    v_writelane_b32 v20, s38, 6
-; VI-NEXT:    v_writelane_b32 v20, s39, 7
-; VI-NEXT:    v_writelane_b32 v20, s48, 8
-; VI-NEXT:    v_writelane_b32 v20, s49, 9
-; VI-NEXT:    v_writelane_b32 v20, s50, 10
-; VI-NEXT:    v_writelane_b32 v20, s51, 11
-; VI-NEXT:    v_writelane_b32 v20, s52, 12
-; VI-NEXT:    v_writelane_b32 v20, s53, 13
-; VI-NEXT:    v_writelane_b32 v20, s54, 14
-; VI-NEXT:    v_writelane_b32 v20, s55, 15
-; VI-NEXT:    v_writelane_b32 v20, s64, 16
+; VI-NEXT:    v_writelane_b32 v20, s34, 0
+; VI-NEXT:    v_writelane_b32 v20, s35, 1
+; VI-NEXT:    v_writelane_b32 v20, s36, 2
+; VI-NEXT:    v_writelane_b32 v20, s37, 3
+; VI-NEXT:    v_writelane_b32 v20, s38, 4
+; VI-NEXT:    v_writelane_b32 v20, s39, 5
+; VI-NEXT:    v_writelane_b32 v20, s48, 6
+; VI-NEXT:    v_writelane_b32 v20, s49, 7
+; VI-NEXT:    v_writelane_b32 v20, s50, 8
+; VI-NEXT:    v_writelane_b32 v20, s51, 9
+; VI-NEXT:    v_writelane_b32 v20, s52, 10
+; VI-NEXT:    v_writelane_b32 v20, s53, 11
+; VI-NEXT:    v_writelane_b32 v20, s54, 12
+; VI-NEXT:    v_writelane_b32 v20, s55, 13
+; VI-NEXT:    v_writelane_b32 v20, s64, 14
+; VI-NEXT:    v_writelane_b32 v20, s65, 15
+; VI-NEXT:    v_writelane_b32 v20, s66, 16
+; VI-NEXT:    v_writelane_b32 v20, s67, 17
+; VI-NEXT:    v_writelane_b32 v20, s30, 18
 ; VI-NEXT:    v_readfirstlane_b32 s7, v5
 ; VI-NEXT:    v_readfirstlane_b32 s9, v4
 ; VI-NEXT:    v_readfirstlane_b32 s12, v3
 ; VI-NEXT:    v_readfirstlane_b32 s15, v2
 ; VI-NEXT:    v_readfirstlane_b32 s74, v1
 ; VI-NEXT:    v_readfirstlane_b32 s77, v0
-; VI-NEXT:    v_writelane_b32 v20, s65, 17
+; VI-NEXT:    v_writelane_b32 v20, s31, 19
 ; VI-NEXT:    s_lshr_b32 s11, s29, 16
 ; VI-NEXT:    s_lshr_b32 s13, s28, 16
 ; VI-NEXT:    s_lshr_b32 s72, s27, 16
@@ -20479,9 +20481,7 @@ define inreg <10 x i64> @bitcast_v40i16_to_v10i64_scalar(<40 x i16> inreg %a, i3
 ; VI-NEXT:    s_lshr_b32 s73, s74, 16
 ; VI-NEXT:    s_lshr_b32 s76, s77, 16
 ; VI-NEXT:    v_readfirstlane_b32 s4, v6
-; VI-NEXT:    v_writelane_b32 v20, s66, 18
 ; VI-NEXT:    s_cmp_lg_u32 s4, 0
-; VI-NEXT:    v_writelane_b32 v20, s67, 19
 ; VI-NEXT:    s_cbranch_scc0 .LBB43_4
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    s_and_b32 s4, 0xffff, s16
@@ -20647,6 +20647,7 @@ define inreg <10 x i64> @bitcast_v40i16_to_v10i64_scalar(<40 x i16> inreg %a, i3
 ; VI-NEXT:    s_or_b32 s4, s5, s4
 ; VI-NEXT:    s_add_i32 s55, s4, 0x30000
 ; VI-NEXT:  .LBB43_3: ; %end
+; VI-NEXT:    v_readlane_b32 s30, v20, 18
 ; VI-NEXT:    v_mov_b32_e32 v0, s36
 ; VI-NEXT:    v_mov_b32_e32 v1, s37
 ; VI-NEXT:    v_mov_b32_e32 v2, s38
@@ -20667,26 +20668,25 @@ define inreg <10 x i64> @bitcast_v40i16_to_v10i64_scalar(<40 x i16> inreg %a, i3
 ; VI-NEXT:    v_mov_b32_e32 v17, s53
 ; VI-NEXT:    v_mov_b32_e32 v18, s54
 ; VI-NEXT:    v_mov_b32_e32 v19, s55
-; VI-NEXT:    v_readlane_b32 s67, v20, 19
-; VI-NEXT:    v_readlane_b32 s66, v20, 18
-; VI-NEXT:    v_readlane_b32 s65, v20, 17
-; VI-NEXT:    v_readlane_b32 s64, v20, 16
-; VI-NEXT:    v_readlane_b32 s55, v20, 15
-; VI-NEXT:    v_readlane_b32 s54, v20, 14
-; VI-NEXT:    v_readlane_b32 s53, v20, 13
-; VI-NEXT:    v_readlane_b32 s52, v20, 12
-; VI-NEXT:    v_readlane_b32 s51, v20, 11
-; VI-NEXT:    v_readlane_b32 s50, v20, 10
-; VI-NEXT:    v_readlane_b32 s49, v20, 9
-; VI-NEXT:    v_readlane_b32 s48, v20, 8
-; VI-NEXT:    v_readlane_b32 s39, v20, 7
-; VI-NEXT:    v_readlane_b32 s38, v20, 6
-; VI-NEXT:    v_readlane_b32 s37, v20, 5
-; VI-NEXT:    v_readlane_b32 s36, v20, 4
-; VI-NEXT:    v_readlane_b32 s35, v20, 3
-; VI-NEXT:    v_readlane_b32 s34, v20, 2
-; VI-NEXT:    v_readlane_b32 s31, v20, 1
-; VI-NEXT:    v_readlane_b32 s30, v20, 0
+; VI-NEXT:    v_readlane_b32 s31, v20, 19
+; VI-NEXT:    v_readlane_b32 s67, v20, 17
+; VI-NEXT:    v_readlane_b32 s66, v20, 16
+; VI-NEXT:    v_readlane_b32 s65, v20, 15
+; VI-NEXT:    v_readlane_b32 s64, v20, 14
+; VI-NEXT:    v_readlane_b32 s55, v20, 13
+; VI-NEXT:    v_readlane_b32 s54, v20, 12
+; VI-NEXT:    v_readlane_b32 s53, v20, 11
+; VI-NEXT:    v_readlane_b32 s52, v20, 10
+; VI-NEXT:    v_readlane_b32 s51, v20, 9
+; VI-NEXT:    v_readlane_b32 s50, v20, 8
+; VI-NEXT:    v_readlane_b32 s49, v20, 7
+; VI-NEXT:    v_readlane_b32 s48, v20, 6
+; VI-NEXT:    v_readlane_b32 s39, v20, 5
+; VI-NEXT:    v_readlane_b32 s38, v20, 4
+; VI-NEXT:    v_readlane_b32 s37, v20, 3
+; VI-NEXT:    v_readlane_b32 s36, v20, 2
+; VI-NEXT:    v_readlane_b32 s35, v20, 1
+; VI-NEXT:    v_readlane_b32 s34, v20, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -20952,7 +20952,7 @@ end:
   ret <10 x i64> %phi
 }
 
-define <40 x half> @bitcast_v10i64_to_v40f16(<10 x i64> %a, i32 %b) {
+define <40 x half> @bitcast_v10i64_to_v40f16(<10 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v10i64_to_v40f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -21540,7 +21540,7 @@ end:
   ret <40 x half> %phi
 }
 
-define inreg <40 x half> @bitcast_v10i64_to_v40f16_scalar(<10 x i64> inreg %a, i32 inreg %b) {
+define inreg <40 x half> @bitcast_v10i64_to_v40f16_scalar(<10 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v10i64_to_v40f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22186,7 +22186,7 @@ end:
   ret <40 x half> %phi
 }
 
-define <10 x i64> @bitcast_v40f16_to_v10i64(<40 x half> %a, i32 %b) {
+define <10 x i64> @bitcast_v40f16_to_v10i64(<40 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v40f16_to_v10i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -23119,7 +23119,7 @@ end:
   ret <10 x i64> %phi
 }
 
-define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i32 inreg %b) {
+define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v40f16_to_v10i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -23461,30 +23461,32 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v32, s30, 0
-; VI-NEXT:    v_writelane_b32 v32, s31, 1
-; VI-NEXT:    v_writelane_b32 v32, s34, 2
-; VI-NEXT:    v_writelane_b32 v32, s35, 3
-; VI-NEXT:    v_writelane_b32 v32, s36, 4
-; VI-NEXT:    v_writelane_b32 v32, s37, 5
-; VI-NEXT:    v_writelane_b32 v32, s38, 6
-; VI-NEXT:    v_writelane_b32 v32, s39, 7
-; VI-NEXT:    v_writelane_b32 v32, s48, 8
-; VI-NEXT:    v_writelane_b32 v32, s49, 9
-; VI-NEXT:    v_writelane_b32 v32, s50, 10
-; VI-NEXT:    v_writelane_b32 v32, s51, 11
-; VI-NEXT:    v_writelane_b32 v32, s52, 12
-; VI-NEXT:    v_writelane_b32 v32, s53, 13
-; VI-NEXT:    v_writelane_b32 v32, s54, 14
-; VI-NEXT:    v_writelane_b32 v32, s55, 15
-; VI-NEXT:    v_writelane_b32 v32, s64, 16
+; VI-NEXT:    v_writelane_b32 v32, s34, 0
+; VI-NEXT:    v_writelane_b32 v32, s35, 1
+; VI-NEXT:    v_writelane_b32 v32, s36, 2
+; VI-NEXT:    v_writelane_b32 v32, s37, 3
+; VI-NEXT:    v_writelane_b32 v32, s38, 4
+; VI-NEXT:    v_writelane_b32 v32, s39, 5
+; VI-NEXT:    v_writelane_b32 v32, s48, 6
+; VI-NEXT:    v_writelane_b32 v32, s49, 7
+; VI-NEXT:    v_writelane_b32 v32, s50, 8
+; VI-NEXT:    v_writelane_b32 v32, s51, 9
+; VI-NEXT:    v_writelane_b32 v32, s52, 10
+; VI-NEXT:    v_writelane_b32 v32, s53, 11
+; VI-NEXT:    v_writelane_b32 v32, s54, 12
+; VI-NEXT:    v_writelane_b32 v32, s55, 13
+; VI-NEXT:    v_writelane_b32 v32, s64, 14
+; VI-NEXT:    v_writelane_b32 v32, s65, 15
+; VI-NEXT:    v_writelane_b32 v32, s66, 16
+; VI-NEXT:    v_writelane_b32 v32, s67, 17
+; VI-NEXT:    v_writelane_b32 v32, s30, 18
 ; VI-NEXT:    v_readfirstlane_b32 s6, v5
 ; VI-NEXT:    v_readfirstlane_b32 s8, v4
 ; VI-NEXT:    v_readfirstlane_b32 s11, v3
 ; VI-NEXT:    v_readfirstlane_b32 s14, v2
 ; VI-NEXT:    v_readfirstlane_b32 s73, v1
 ; VI-NEXT:    v_readfirstlane_b32 s76, v0
-; VI-NEXT:    v_writelane_b32 v32, s65, 17
+; VI-NEXT:    v_writelane_b32 v32, s31, 19
 ; VI-NEXT:    s_lshr_b32 s10, s29, 16
 ; VI-NEXT:    s_lshr_b32 s13, s28, 16
 ; VI-NEXT:    s_lshr_b32 s72, s27, 16
@@ -23506,9 +23508,7 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i
 ; VI-NEXT:    s_lshr_b32 s75, s73, 16
 ; VI-NEXT:    s_lshr_b32 s78, s76, 16
 ; VI-NEXT:    v_readfirstlane_b32 s4, v6
-; VI-NEXT:    v_writelane_b32 v32, s66, 18
 ; VI-NEXT:    s_cmp_lg_u32 s4, 0
-; VI-NEXT:    v_writelane_b32 v32, s67, 19
 ; VI-NEXT:    s_cbranch_scc0 .LBB47_3
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    s_and_b32 s4, 0xffff, s16
@@ -23692,26 +23692,26 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i
 ; VI-NEXT:    v_mov_b32_e32 v30, s66
 ; VI-NEXT:    v_mov_b32_e32 v31, s67
 ; VI-NEXT:  .LBB47_5: ; %end
-; VI-NEXT:    v_readlane_b32 s67, v32, 19
-; VI-NEXT:    v_readlane_b32 s66, v32, 18
-; VI-NEXT:    v_readlane_b32 s65, v32, 17
-; VI-NEXT:    v_readlane_b32 s64, v32, 16
-; VI-NEXT:    v_readlane_b32 s55, v32, 15
-; VI-NEXT:    v_readlane_b32 s54, v32, 14
-; VI-NEXT:    v_readlane_b32 s53, v32, 13
-; VI-NEXT:    v_readlane_b32 s52, v32, 12
-; VI-NEXT:    v_readlane_b32 s51, v32, 11
-; VI-NEXT:    v_readlane_b32 s50, v32, 10
-; VI-NEXT:    v_readlane_b32 s49, v32, 9
-; VI-NEXT:    v_readlane_b32 s48, v32, 8
-; VI-NEXT:    v_readlane_b32 s39, v32, 7
-; VI-NEXT:    v_readlane_b32 s38, v32, 6
-; VI-NEXT:    v_readlane_b32 s37, v32, 5
-; VI-NEXT:    v_readlane_b32 s36, v32, 4
-; VI-NEXT:    v_readlane_b32 s35, v32, 3
-; VI-NEXT:    v_readlane_b32 s34, v32, 2
-; VI-NEXT:    v_readlane_b32 s31, v32, 1
-; VI-NEXT:    v_readlane_b32 s30, v32, 0
+; VI-NEXT:    v_readlane_b32 s30, v32, 18
+; VI-NEXT:    v_readlane_b32 s31, v32, 19
+; VI-NEXT:    v_readlane_b32 s67, v32, 17
+; VI-NEXT:    v_readlane_b32 s66, v32, 16
+; VI-NEXT:    v_readlane_b32 s65, v32, 15
+; VI-NEXT:    v_readlane_b32 s64, v32, 14
+; VI-NEXT:    v_readlane_b32 s55, v32, 13
+; VI-NEXT:    v_readlane_b32 s54, v32, 12
+; VI-NEXT:    v_readlane_b32 s53, v32, 11
+; VI-NEXT:    v_readlane_b32 s52, v32, 10
+; VI-NEXT:    v_readlane_b32 s51, v32, 9
+; VI-NEXT:    v_readlane_b32 s50, v32, 8
+; VI-NEXT:    v_readlane_b32 s49, v32, 7
+; VI-NEXT:    v_readlane_b32 s48, v32, 6
+; VI-NEXT:    v_readlane_b32 s39, v32, 5
+; VI-NEXT:    v_readlane_b32 s38, v32, 4
+; VI-NEXT:    v_readlane_b32 s37, v32, 3
+; VI-NEXT:    v_readlane_b32 s36, v32, 2
+; VI-NEXT:    v_readlane_b32 s35, v32, 1
+; VI-NEXT:    v_readlane_b32 s34, v32, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -23975,7 +23975,7 @@ end:
   ret <10 x i64> %phi
 }
 
-define <40 x i16> @bitcast_v10f64_to_v40i16(<10 x double> %a, i32 %b) {
+define <40 x i16> @bitcast_v10f64_to_v40i16(<10 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v10f64_to_v40i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -24503,7 +24503,7 @@ end:
   ret <40 x i16> %phi
 }
 
-define inreg <40 x i16> @bitcast_v10f64_to_v40i16_scalar(<10 x double> inreg %a, i32 inreg %b) {
+define inreg <40 x i16> @bitcast_v10f64_to_v40i16_scalar(<10 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v10f64_to_v40i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -25337,7 +25337,7 @@ end:
   ret <40 x i16> %phi
 }
 
-define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) {
+define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v40i16_to_v10f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -26185,7 +26185,7 @@ end:
   ret <10 x double> %phi
 }
 
-define inreg <10 x double> @bitcast_v40i16_to_v10f64_scalar(<40 x i16> inreg %a, i32 inreg %b) {
+define inreg <10 x double> @bitcast_v40i16_to_v10f64_scalar(<40 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v40i16_to_v10f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -26453,30 +26453,32 @@ define inreg <10 x double> @bitcast_v40i16_to_v10f64_scalar(<40 x i16> inreg %a,
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v20, s30, 0
-; VI-NEXT:    v_writelane_b32 v20, s31, 1
-; VI-NEXT:    v_writelane_b32 v20, s34, 2
-; VI-NEXT:    v_writelane_b32 v20, s35, 3
-; VI-NEXT:    v_writelane_b32 v20, s36, 4
-; VI-NEXT:    v_writelane_b32 v20, s37, 5
-; VI-NEXT:    v_writelane_b32 v20, s38, 6
-; VI-NEXT:    v_writelane_b32 v20, s39, 7
-; VI-NEXT:    v_writelane_b32 v20, s48, 8
-; VI-NEXT:    v_writelane_b32 v20, s49, 9
-; VI-NEXT:    v_writelane_b32 v20, s50, 10
-; VI-NEXT:    v_writelane_b32 v20, s51, 11
-; VI-NEXT:    v_writelane_b32 v20, s52, 12
-; VI-NEXT:    v_writelane_b32 v20, s53, 13
-; VI-NEXT:    v_writelane_b32 v20, s54, 14
-; VI-NEXT:    v_writelane_b32 v20, s55, 15
-; VI-NEXT:    v_writelane_b32 v20, s64, 16
+; VI-NEXT:    v_writelane_b32 v20, s34, 0
+; VI-NEXT:    v_writelane_b32 v20, s35, 1
+; VI-NEXT:    v_writelane_b32 v20, s36, 2
+; VI-NEXT:    v_writelane_b32 v20, s37, 3
+; VI-NEXT:    v_writelane_b32 v20, s38, 4
+; VI-NEXT:    v_writelane_b32 v20, s39, 5
+; VI-NEXT:    v_writelane_b32 v20, s48, 6
+; VI-NEXT:    v_writelane_b32 v20, s49, 7
+; VI-NEXT:    v_writelane_b32 v20, s50, 8
+; VI-NEXT:    v_writelane_b32 v20, s51, 9
+; VI-NEXT:    v_writelane_b32 v20, s52, 10
+; VI-NEXT:    v_writelane_b32 v20, s53, 11
+; VI-NEXT:    v_writelane_b32 v20, s54, 12
+; VI-NEXT:    v_writelane_b32 v20, s55, 13
+; VI-NEXT:    v_writelane_b32 v20, s64, 14
+; VI-NEXT:    v_writelane_b32 v20, s65, 15
+; VI-NEXT:    v_writelane_b32 v20, s66, 16
+; VI-NEXT:    v_writelane_b32 v20, s67, 17
+; VI-NEXT:    v_writelane_b32 v20, s30, 18
 ; VI-NEXT:    v_readfirstlane_b32 s7, v5
 ; VI-NEXT:    v_readfirstlane_b32 s9, v4
 ; VI-NEXT:    v_readfirstlane_b32 s12, v3
 ; VI-NEXT:    v_readfirstlane_b32 s15, v2
 ; VI-NEXT:    v_readfirstlane_b32 s74, v1
 ; VI-NEXT:    v_readfirstlane_b32 s77, v0
-; VI-NEXT:    v_writelane_b32 v20, s65, 17
+; VI-NEXT:    v_writelane_b32 v20, s31, 19
 ; VI-NEXT:    s_lshr_b32 s11, s29, 16
 ; VI-NEXT:    s_lshr_b32 s13, s28, 16
 ; VI-NEXT:    s_lshr_b32 s72, s27, 16
@@ -26498,9 +26500,7 @@ define inreg <10 x double> @bitcast_v40i16_to_v10f64_scalar(<40 x i16> inreg %a,
 ; VI-NEXT:    s_lshr_b32 s73, s74, 16
 ; VI-NEXT:    s_lshr_b32 s76, s77, 16
 ; VI-NEXT:    v_readfirstlane_b32 s4, v6
-; VI-NEXT:    v_writelane_b32 v20, s66, 18
 ; VI-NEXT:    s_cmp_lg_u32 s4, 0
-; VI-NEXT:    v_writelane_b32 v20, s67, 19
 ; VI-NEXT:    s_cbranch_scc0 .LBB51_4
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    s_and_b32 s4, 0xffff, s16
@@ -26666,6 +26666,7 @@ define inreg <10 x double> @bitcast_v40i16_to_v10f64_scalar(<40 x i16> inreg %a,
 ; VI-NEXT:    s_or_b32 s4, s5, s4
 ; VI-NEXT:    s_add_i32 s55, s4, 0x30000
 ; VI-NEXT:  .LBB51_3: ; %end
+; VI-NEXT:    v_readlane_b32 s30, v20, 18
 ; VI-NEXT:    v_mov_b32_e32 v0, s36
 ; VI-NEXT:    v_mov_b32_e32 v1, s37
 ; VI-NEXT:    v_mov_b32_e32 v2, s38
@@ -26686,26 +26687,25 @@ define inreg <10 x double> @bitcast_v40i16_to_v10f64_scalar(<40 x i16> inreg %a,
 ; VI-NEXT:    v_mov_b32_e32 v17, s53
 ; VI-NEXT:    v_mov_b32_e32 v18, s54
 ; VI-NEXT:    v_mov_b32_e32 v19, s55
-; VI-NEXT:    v_readlane_b32 s67, v20, 19
-; VI-NEXT:    v_readlane_b32 s66, v20, 18
-; VI-NEXT:    v_readlane_b32 s65, v20, 17
-; VI-NEXT:    v_readlane_b32 s64, v20, 16
-; VI-NEXT:    v_readlane_b32 s55, v20, 15
-; VI-NEXT:    v_readlane_b32 s54, v20, 14
-; VI-NEXT:    v_readlane_b32 s53, v20, 13
-; VI-NEXT:    v_readlane_b32 s52, v20, 12
-; VI-NEXT:    v_readlane_b32 s51, v20, 11
-; VI-NEXT:    v_readlane_b32 s50, v20, 10
-; VI-NEXT:    v_readlane_b32 s49, v20, 9
-; VI-NEXT:    v_readlane_b32 s48, v20, 8
-; VI-NEXT:    v_readlane_b32 s39, v20, 7
-; VI-NEXT:    v_readlane_b32 s38, v20, 6
-; VI-NEXT:    v_readlane_b32 s37, v20, 5
-; VI-NEXT:    v_readlane_b32 s36, v20, 4
-; VI-NEXT:    v_readlane_b32 s35, v20, 3
-; VI-NEXT:    v_readlane_b32 s34, v20, 2
-; VI-NEXT:    v_readlane_b32 s31, v20, 1
-; VI-NEXT:    v_readlane_b32 s30, v20, 0
+; VI-NEXT:    v_readlane_b32 s31, v20, 19
+; VI-NEXT:    v_readlane_b32 s67, v20, 17
+; VI-NEXT:    v_readlane_b32 s66, v20, 16
+; VI-NEXT:    v_readlane_b32 s65, v20, 15
+; VI-NEXT:    v_readlane_b32 s64, v20, 14
+; VI-NEXT:    v_readlane_b32 s55, v20, 13
+; VI-NEXT:    v_readlane_b32 s54, v20, 12
+; VI-NEXT:    v_readlane_b32 s53, v20, 11
+; VI-NEXT:    v_readlane_b32 s52, v20, 10
+; VI-NEXT:    v_readlane_b32 s51, v20, 9
+; VI-NEXT:    v_readlane_b32 s50, v20, 8
+; VI-NEXT:    v_readlane_b32 s49, v20, 7
+; VI-NEXT:    v_readlane_b32 s48, v20, 6
+; VI-NEXT:    v_readlane_b32 s39, v20, 5
+; VI-NEXT:    v_readlane_b32 s38, v20, 4
+; VI-NEXT:    v_readlane_b32 s37, v20, 3
+; VI-NEXT:    v_readlane_b32 s36, v20, 2
+; VI-NEXT:    v_readlane_b32 s35, v20, 1
+; VI-NEXT:    v_readlane_b32 s34, v20, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -26971,7 +26971,7 @@ end:
   ret <10 x double> %phi
 }
 
-define <40 x half> @bitcast_v10f64_to_v40f16(<10 x double> %a, i32 %b) {
+define <40 x half> @bitcast_v10f64_to_v40f16(<10 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v10f64_to_v40f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -27499,7 +27499,7 @@ end:
   ret <40 x half> %phi
 }
 
-define inreg <40 x half> @bitcast_v10f64_to_v40f16_scalar(<10 x double> inreg %a, i32 inreg %b) {
+define inreg <40 x half> @bitcast_v10f64_to_v40f16_scalar(<10 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v10f64_to_v40f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -28333,7 +28333,7 @@ end:
   ret <40 x half> %phi
 }
 
-define <10 x double> @bitcast_v40f16_to_v10f64(<40 x half> %a, i32 %b) {
+define <10 x double> @bitcast_v40f16_to_v10f64(<40 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v40f16_to_v10f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -29266,7 +29266,7 @@ end:
   ret <10 x double> %phi
 }
 
-define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a, i32 inreg %b) {
+define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v40f16_to_v10f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -29608,30 +29608,32 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v32, s30, 0
-; VI-NEXT:    v_writelane_b32 v32, s31, 1
-; VI-NEXT:    v_writelane_b32 v32, s34, 2
-; VI-NEXT:    v_writelane_b32 v32, s35, 3
-; VI-NEXT:    v_writelane_b32 v32, s36, 4
-; VI-NEXT:    v_writelane_b32 v32, s37, 5
-; VI-NEXT:    v_writelane_b32 v32, s38, 6
-; VI-NEXT:    v_writelane_b32 v32, s39, 7
-; VI-NEXT:    v_writelane_b32 v32, s48, 8
-; VI-NEXT:    v_writelane_b32 v32, s49, 9
-; VI-NEXT:    v_writelane_b32 v32, s50, 10
-; VI-NEXT:    v_writelane_b32 v32, s51, 11
-; VI-NEXT:    v_writelane_b32 v32, s52, 12
-; VI-NEXT:    v_writelane_b32 v32, s53, 13
-; VI-NEXT:    v_writelane_b32 v32, s54, 14
-; VI-NEXT:    v_writelane_b32 v32, s55, 15
-; VI-NEXT:    v_writelane_b32 v32, s64, 16
+; VI-NEXT:    v_writelane_b32 v32, s34, 0
+; VI-NEXT:    v_writelane_b32 v32, s35, 1
+; VI-NEXT:    v_writelane_b32 v32, s36, 2
+; VI-NEXT:    v_writelane_b32 v32, s37, 3
+; VI-NEXT:    v_writelane_b32 v32, s38, 4
+; VI-NEXT:    v_writelane_b32 v32, s39, 5
+; VI-NEXT:    v_writelane_b32 v32, s48, 6
+; VI-NEXT:    v_writelane_b32 v32, s49, 7
+; VI-NEXT:    v_writelane_b32 v32, s50, 8
+; VI-NEXT:    v_writelane_b32 v32, s51, 9
+; VI-NEXT:    v_writelane_b32 v32, s52, 10
+; VI-NEXT:    v_writelane_b32 v32, s53, 11
+; VI-NEXT:    v_writelane_b32 v32, s54, 12
+; VI-NEXT:    v_writelane_b32 v32, s55, 13
+; VI-NEXT:    v_writelane_b32 v32, s64, 14
+; VI-NEXT:    v_writelane_b32 v32, s65, 15
+; VI-NEXT:    v_writelane_b32 v32, s66, 16
+; VI-NEXT:    v_writelane_b32 v32, s67, 17
+; VI-NEXT:    v_writelane_b32 v32, s30, 18
 ; VI-NEXT:    v_readfirstlane_b32 s6, v5
 ; VI-NEXT:    v_readfirstlane_b32 s8, v4
 ; VI-NEXT:    v_readfirstlane_b32 s11, v3
 ; VI-NEXT:    v_readfirstlane_b32 s14, v2
 ; VI-NEXT:    v_readfirstlane_b32 s73, v1
 ; VI-NEXT:    v_readfirstlane_b32 s76, v0
-; VI-NEXT:    v_writelane_b32 v32, s65, 17
+; VI-NEXT:    v_writelane_b32 v32, s31, 19
 ; VI-NEXT:    s_lshr_b32 s10, s29, 16
 ; VI-NEXT:    s_lshr_b32 s13, s28, 16
 ; VI-NEXT:    s_lshr_b32 s72, s27, 16
@@ -29653,9 +29655,7 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a
 ; VI-NEXT:    s_lshr_b32 s75, s73, 16
 ; VI-NEXT:    s_lshr_b32 s78, s76, 16
 ; VI-NEXT:    v_readfirstlane_b32 s4, v6
-; VI-NEXT:    v_writelane_b32 v32, s66, 18
 ; VI-NEXT:    s_cmp_lg_u32 s4, 0
-; VI-NEXT:    v_writelane_b32 v32, s67, 19
 ; VI-NEXT:    s_cbranch_scc0 .LBB55_3
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    s_and_b32 s4, 0xffff, s16
@@ -29839,26 +29839,26 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a
 ; VI-NEXT:    v_mov_b32_e32 v30, s66
 ; VI-NEXT:    v_mov_b32_e32 v31, s67
 ; VI-NEXT:  .LBB55_5: ; %end
-; VI-NEXT:    v_readlane_b32 s67, v32, 19
-; VI-NEXT:    v_readlane_b32 s66, v32, 18
-; VI-NEXT:    v_readlane_b32 s65, v32, 17
-; VI-NEXT:    v_readlane_b32 s64, v32, 16
-; VI-NEXT:    v_readlane_b32 s55, v32, 15
-; VI-NEXT:    v_readlane_b32 s54, v32, 14
-; VI-NEXT:    v_readlane_b32 s53, v32, 13
-; VI-NEXT:    v_readlane_b32 s52, v32, 12
-; VI-NEXT:    v_readlane_b32 s51, v32, 11
-; VI-NEXT:    v_readlane_b32 s50, v32, 10
-; VI-NEXT:    v_readlane_b32 s49, v32, 9
-; VI-NEXT:    v_readlane_b32 s48, v32, 8
-; VI-NEXT:    v_readlane_b32 s39, v32, 7
-; VI-NEXT:    v_readlane_b32 s38, v32, 6
-; VI-NEXT:    v_readlane_b32 s37, v32, 5
-; VI-NEXT:    v_readlane_b32 s36, v32, 4
-; VI-NEXT:    v_readlane_b32 s35, v32, 3
-; VI-NEXT:    v_readlane_b32 s34, v32, 2
-; VI-NEXT:    v_readlane_b32 s31, v32, 1
-; VI-NEXT:    v_readlane_b32 s30, v32, 0
+; VI-NEXT:    v_readlane_b32 s30, v32, 18
+; VI-NEXT:    v_readlane_b32 s31, v32, 19
+; VI-NEXT:    v_readlane_b32 s67, v32, 17
+; VI-NEXT:    v_readlane_b32 s66, v32, 16
+; VI-NEXT:    v_readlane_b32 s65, v32, 15
+; VI-NEXT:    v_readlane_b32 s64, v32, 14
+; VI-NEXT:    v_readlane_b32 s55, v32, 13
+; VI-NEXT:    v_readlane_b32 s54, v32, 12
+; VI-NEXT:    v_readlane_b32 s53, v32, 11
+; VI-NEXT:    v_readlane_b32 s52, v32, 10
+; VI-NEXT:    v_readlane_b32 s51, v32, 9
+; VI-NEXT:    v_readlane_b32 s50, v32, 8
+; VI-NEXT:    v_readlane_b32 s49, v32, 7
+; VI-NEXT:    v_readlane_b32 s48, v32, 6
+; VI-NEXT:    v_readlane_b32 s39, v32, 5
+; VI-NEXT:    v_readlane_b32 s38, v32, 4
+; VI-NEXT:    v_readlane_b32 s37, v32, 3
+; VI-NEXT:    v_readlane_b32 s36, v32, 2
+; VI-NEXT:    v_readlane_b32 s35, v32, 1
+; VI-NEXT:    v_readlane_b32 s34, v32, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -30122,7 +30122,7 @@ end:
   ret <10 x double> %phi
 }
 
-define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) {
+define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v40i16_to_v40f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -30990,7 +30990,7 @@ end:
   ret <40 x half> %phi
 }
 
-define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i32 inreg %b) {
+define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v40i16_to_v40f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -30998,36 +30998,36 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v20, s30, 0
-; SI-NEXT:    v_writelane_b32 v20, s31, 1
-; SI-NEXT:    v_writelane_b32 v20, s34, 2
-; SI-NEXT:    v_writelane_b32 v20, s35, 3
-; SI-NEXT:    v_writelane_b32 v20, s36, 4
-; SI-NEXT:    v_writelane_b32 v20, s37, 5
-; SI-NEXT:    v_writelane_b32 v20, s38, 6
-; SI-NEXT:    v_writelane_b32 v20, s39, 7
-; SI-NEXT:    v_writelane_b32 v20, s48, 8
-; SI-NEXT:    v_writelane_b32 v20, s49, 9
-; SI-NEXT:    v_writelane_b32 v20, s50, 10
-; SI-NEXT:    v_writelane_b32 v20, s51, 11
-; SI-NEXT:    v_writelane_b32 v20, s52, 12
-; SI-NEXT:    v_writelane_b32 v20, s53, 13
-; SI-NEXT:    v_writelane_b32 v20, s54, 14
-; SI-NEXT:    v_writelane_b32 v20, s55, 15
-; SI-NEXT:    v_writelane_b32 v20, s64, 16
-; SI-NEXT:    v_writelane_b32 v20, s65, 17
-; SI-NEXT:    v_writelane_b32 v20, s66, 18
-; SI-NEXT:    v_writelane_b32 v20, s67, 19
-; SI-NEXT:    v_writelane_b32 v20, s68, 20
-; SI-NEXT:    v_writelane_b32 v20, s69, 21
-; SI-NEXT:    v_writelane_b32 v20, s70, 22
-; SI-NEXT:    v_writelane_b32 v20, s71, 23
+; SI-NEXT:    v_writelane_b32 v20, s34, 0
+; SI-NEXT:    v_writelane_b32 v20, s35, 1
+; SI-NEXT:    v_writelane_b32 v20, s36, 2
+; SI-NEXT:    v_writelane_b32 v20, s37, 3
+; SI-NEXT:    v_writelane_b32 v20, s38, 4
+; SI-NEXT:    v_writelane_b32 v20, s39, 5
+; SI-NEXT:    v_writelane_b32 v20, s48, 6
+; SI-NEXT:    v_writelane_b32 v20, s49, 7
+; SI-NEXT:    v_writelane_b32 v20, s50, 8
+; SI-NEXT:    v_writelane_b32 v20, s51, 9
+; SI-NEXT:    v_writelane_b32 v20, s52, 10
+; SI-NEXT:    v_writelane_b32 v20, s53, 11
+; SI-NEXT:    v_writelane_b32 v20, s54, 12
+; SI-NEXT:    v_writelane_b32 v20, s55, 13
+; SI-NEXT:    v_writelane_b32 v20, s64, 14
+; SI-NEXT:    v_writelane_b32 v20, s65, 15
+; SI-NEXT:    v_writelane_b32 v20, s66, 16
+; SI-NEXT:    v_writelane_b32 v20, s67, 17
+; SI-NEXT:    v_writelane_b32 v20, s68, 18
+; SI-NEXT:    v_writelane_b32 v20, s69, 19
+; SI-NEXT:    v_writelane_b32 v20, s70, 20
+; SI-NEXT:    v_writelane_b32 v20, s71, 21
+; SI-NEXT:    v_writelane_b32 v20, s30, 22
 ; SI-NEXT:    v_readfirstlane_b32 s69, v5
 ; SI-NEXT:    v_readfirstlane_b32 s71, v4
 ; SI-NEXT:    v_readfirstlane_b32 s66, v3
 ; SI-NEXT:    v_readfirstlane_b32 s68, v2
 ; SI-NEXT:    v_readfirstlane_b32 s55, v1
 ; SI-NEXT:    v_readfirstlane_b32 s65, v0
+; SI-NEXT:    v_writelane_b32 v20, s31, 23
 ; SI-NEXT:    s_lshr_b32 s36, s29, 16
 ; SI-NEXT:    s_lshr_b32 s54, s28, 16
 ; SI-NEXT:    s_lshr_b32 s35, s27, 16
@@ -31315,6 +31315,7 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i
 ; SI-NEXT:    s_and_b32 s5, s5, 0xffff
 ; SI-NEXT:    s_lshl_b32 s24, s39, 16
 ; SI-NEXT:    s_or_b32 s5, s5, s24
+; SI-NEXT:    v_readlane_b32 s30, v20, 22
 ; SI-NEXT:    v_mov_b32_e32 v0, s16
 ; SI-NEXT:    v_mov_b32_e32 v1, s17
 ; SI-NEXT:    v_mov_b32_e32 v2, s18
@@ -31335,30 +31336,29 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v17, s7
 ; SI-NEXT:    v_mov_b32_e32 v18, s4
 ; SI-NEXT:    v_mov_b32_e32 v19, s5
-; SI-NEXT:    v_readlane_b32 s71, v20, 23
-; SI-NEXT:    v_readlane_b32 s70, v20, 22
-; SI-NEXT:    v_readlane_b32 s69, v20, 21
-; SI-NEXT:    v_readlane_b32 s68, v20, 20
-; SI-NEXT:    v_readlane_b32 s67, v20, 19
-; SI-NEXT:    v_readlane_b32 s66, v20, 18
-; SI-NEXT:    v_readlane_b32 s65, v20, 17
-; SI-NEXT:    v_readlane_b32 s64, v20, 16
-; SI-NEXT:    v_readlane_b32 s55, v20, 15
-; SI-NEXT:    v_readlane_b32 s54, v20, 14
-; SI-NEXT:    v_readlane_b32 s53, v20, 13
-; SI-NEXT:    v_readlane_b32 s52, v20, 12
-; SI-NEXT:    v_readlane_b32 s51, v20, 11
-; SI-NEXT:    v_readlane_b32 s50, v20, 10
-; SI-NEXT:    v_readlane_b32 s49, v20, 9
-; SI-NEXT:    v_readlane_b32 s48, v20, 8
-; SI-NEXT:    v_readlane_b32 s39, v20, 7
-; SI-NEXT:    v_readlane_b32 s38, v20, 6
-; SI-NEXT:    v_readlane_b32 s37, v20, 5
-; SI-NEXT:    v_readlane_b32 s36, v20, 4
-; SI-NEXT:    v_readlane_b32 s35, v20, 3
-; SI-NEXT:    v_readlane_b32 s34, v20, 2
-; SI-NEXT:    v_readlane_b32 s31, v20, 1
-; SI-NEXT:    v_readlane_b32 s30, v20, 0
+; SI-NEXT:    v_readlane_b32 s31, v20, 23
+; SI-NEXT:    v_readlane_b32 s71, v20, 21
+; SI-NEXT:    v_readlane_b32 s70, v20, 20
+; SI-NEXT:    v_readlane_b32 s69, v20, 19
+; SI-NEXT:    v_readlane_b32 s68, v20, 18
+; SI-NEXT:    v_readlane_b32 s67, v20, 17
+; SI-NEXT:    v_readlane_b32 s66, v20, 16
+; SI-NEXT:    v_readlane_b32 s65, v20, 15
+; SI-NEXT:    v_readlane_b32 s64, v20, 14
+; SI-NEXT:    v_readlane_b32 s55, v20, 13
+; SI-NEXT:    v_readlane_b32 s54, v20, 12
+; SI-NEXT:    v_readlane_b32 s53, v20, 11
+; SI-NEXT:    v_readlane_b32 s52, v20, 10
+; SI-NEXT:    v_readlane_b32 s51, v20, 9
+; SI-NEXT:    v_readlane_b32 s50, v20, 8
+; SI-NEXT:    v_readlane_b32 s49, v20, 7
+; SI-NEXT:    v_readlane_b32 s48, v20, 6
+; SI-NEXT:    v_readlane_b32 s39, v20, 5
+; SI-NEXT:    v_readlane_b32 s38, v20, 4
+; SI-NEXT:    v_readlane_b32 s37, v20, 3
+; SI-NEXT:    v_readlane_b32 s36, v20, 2
+; SI-NEXT:    v_readlane_b32 s35, v20, 1
+; SI-NEXT:    v_readlane_b32 s34, v20, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -32047,7 +32047,7 @@ end:
   ret <40 x half> %phi
 }
 
-define <40 x i16> @bitcast_v40f16_to_v40i16(<40 x half> %a, i32 %b) {
+define <40 x i16> @bitcast_v40f16_to_v40i16(<40 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v40f16_to_v40i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -32704,7 +32704,7 @@ end:
   ret <40 x i16> %phi
 }
 
-define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i32 inreg %b) {
+define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v40f16_to_v40i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -33710,3 +33710,5 @@ end:
   %phi = phi <40 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
   ret <40 x i16> %phi
 }
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll
index 45e835ddb0e28..d975358b3570f 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll
@@ -6,7 +6,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
-define double @bitcast_i64_to_f64(i64 %a, i32 %b) {
+define double @bitcast_i64_to_f64(i64 %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_i64_to_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -81,7 +81,7 @@ end:
   ret double %phi
 }
 
-define inreg double @bitcast_i64_to_f64_scalar(i64 inreg %a, i32 inreg %b) {
+define inreg double @bitcast_i64_to_f64_scalar(i64 inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_i64_to_f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -168,7 +168,7 @@ end:
   ret double %phi
 }
 
-define i64 @bitcast_f64_to_i64(double %a, i32 %b) {
+define i64 @bitcast_f64_to_i64(double %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_f64_to_i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -239,7 +239,7 @@ end:
   ret i64 %phi
 }
 
-define inreg i64 @bitcast_f64_to_i64_scalar(double inreg %a, i32 inreg %b) {
+define inreg i64 @bitcast_f64_to_i64_scalar(double inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_f64_to_i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -325,7 +325,7 @@ end:
   ret i64 %phi
 }
 
-define <2 x i32> @bitcast_i64_to_v2i32(i64 %a, i32 %b) {
+define <2 x i32> @bitcast_i64_to_v2i32(i64 %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_i64_to_v2i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -400,7 +400,7 @@ end:
   ret <2 x i32> %phi
 }
 
-define inreg <2 x i32> @bitcast_i64_to_v2i32_scalar(i64 inreg %a, i32 inreg %b) {
+define inreg <2 x i32> @bitcast_i64_to_v2i32_scalar(i64 inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_i64_to_v2i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -487,7 +487,7 @@ end:
   ret <2 x i32> %phi
 }
 
-define i64 @bitcast_v2i32_to_i64(<2 x i32> %a, i32 %b) {
+define i64 @bitcast_v2i32_to_i64(<2 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2i32_to_i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -561,7 +561,7 @@ end:
   ret i64 %phi
 }
 
-define inreg i64 @bitcast_v2i32_to_i64_scalar(<2 x i32> inreg %a, i32 inreg %b) {
+define inreg i64 @bitcast_v2i32_to_i64_scalar(<2 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2i32_to_i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -648,7 +648,7 @@ end:
   ret i64 %phi
 }
 
-define <2 x float> @bitcast_i64_to_v2f32(i64 %a, i32 %b) {
+define <2 x float> @bitcast_i64_to_v2f32(i64 %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_i64_to_v2f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -723,7 +723,7 @@ end:
   ret <2 x float> %phi
 }
 
-define inreg <2 x float> @bitcast_i64_to_v2f32_scalar(i64 inreg %a, i32 inreg %b) {
+define inreg <2 x float> @bitcast_i64_to_v2f32_scalar(i64 inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_i64_to_v2f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -810,7 +810,7 @@ end:
   ret <2 x float> %phi
 }
 
-define i64 @bitcast_v2f32_to_i64(<2 x float> %a, i32 %b) {
+define i64 @bitcast_v2f32_to_i64(<2 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2f32_to_i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -883,7 +883,7 @@ end:
   ret i64 %phi
 }
 
-define inreg i64 @bitcast_v2f32_to_i64_scalar(<2 x float> inreg %a, i32 inreg %b) {
+define inreg i64 @bitcast_v2f32_to_i64_scalar(<2 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2f32_to_i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -973,7 +973,7 @@ end:
   ret i64 %phi
 }
 
-define <4 x i16> @bitcast_i64_to_v4i16(i64 %a, i32 %b) {
+define <4 x i16> @bitcast_i64_to_v4i16(i64 %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_i64_to_v4i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1062,7 +1062,7 @@ end:
   ret <4 x i16> %phi
 }
 
-define inreg <4 x i16> @bitcast_i64_to_v4i16_scalar(i64 inreg %a, i32 inreg %b) {
+define inreg <4 x i16> @bitcast_i64_to_v4i16_scalar(i64 inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_i64_to_v4i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1161,7 +1161,7 @@ end:
   ret <4 x i16> %phi
 }
 
-define i64 @bitcast_v4i16_to_i64(<4 x i16> %a, i32 %b) {
+define i64 @bitcast_v4i16_to_i64(<4 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4i16_to_i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1269,7 +1269,7 @@ end:
   ret i64 %phi
 }
 
-define inreg i64 @bitcast_v4i16_to_i64_scalar(<4 x i16> inreg %a, i32 inreg %b) {
+define inreg i64 @bitcast_v4i16_to_i64_scalar(<4 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4i16_to_i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1382,7 +1382,7 @@ end:
   ret i64 %phi
 }
 
-define <4 x half> @bitcast_i64_to_v4f16(i64 %a, i32 %b) {
+define <4 x half> @bitcast_i64_to_v4f16(i64 %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_i64_to_v4f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1471,7 +1471,7 @@ end:
   ret <4 x half> %phi
 }
 
-define inreg <4 x half> @bitcast_i64_to_v4f16_scalar(i64 inreg %a, i32 inreg %b) {
+define inreg <4 x half> @bitcast_i64_to_v4f16_scalar(i64 inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_i64_to_v4f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1570,7 +1570,7 @@ end:
   ret <4 x half> %phi
 }
 
-define i64 @bitcast_v4f16_to_i64(<4 x half> %a, i32 %b) {
+define i64 @bitcast_v4f16_to_i64(<4 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4f16_to_i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1687,7 +1687,7 @@ end:
   ret i64 %phi
 }
 
-define inreg i64 @bitcast_v4f16_to_i64_scalar(<4 x half> inreg %a, i32 inreg %b) {
+define inreg i64 @bitcast_v4f16_to_i64_scalar(<4 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4f16_to_i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1810,7 +1810,7 @@ end:
   ret i64 %phi
 }
 
-define <4 x bfloat> @bitcast_i64_to_v4bf16(i64 %a, i32 %b) {
+define <4 x bfloat> @bitcast_i64_to_v4bf16(i64 %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_i64_to_v4bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1909,7 +1909,7 @@ end:
   ret <4 x bfloat> %phi
 }
 
-define inreg <4 x bfloat> @bitcast_i64_to_v4bf16_scalar(i64 inreg %a, i32 inreg %b) {
+define inreg <4 x bfloat> @bitcast_i64_to_v4bf16_scalar(i64 inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_i64_to_v4bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2014,7 +2014,7 @@ end:
   ret <4 x bfloat> %phi
 }
 
-define i64 @bitcast_v4bf16_to_i64(<4 x bfloat> %a, i32 %b) {
+define i64 @bitcast_v4bf16_to_i64(<4 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4bf16_to_i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2277,7 +2277,7 @@ end:
   ret i64 %phi
 }
 
-define inreg i64 @bitcast_v4bf16_to_i64_scalar(<4 x bfloat> inreg %a, i32 inreg %b) {
+define inreg i64 @bitcast_v4bf16_to_i64_scalar(<4 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4bf16_to_i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2560,7 +2560,7 @@ end:
   ret i64 %phi
 }
 
-define <8 x i8> @bitcast_i64_to_v8i8(i64 %a, i32 %b) {
+define <8 x i8> @bitcast_i64_to_v8i8(i64 %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_i64_to_v8i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2772,7 +2772,7 @@ end:
   ret <8 x i8> %phi
 }
 
-define inreg <8 x i8> @bitcast_i64_to_v8i8_scalar(i64 inreg %a, i32 inreg %b) {
+define inreg <8 x i8> @bitcast_i64_to_v8i8_scalar(i64 inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_i64_to_v8i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2952,7 +2952,7 @@ end:
   ret <8 x i8> %phi
 }
 
-define i64 @bitcast_v8i8_to_i64(<8 x i8> %a, i32 %b) {
+define i64 @bitcast_v8i8_to_i64(<8 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8i8_to_i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3285,7 +3285,7 @@ end:
   ret i64 %phi
 }
 
-define inreg i64 @bitcast_v8i8_to_i64_scalar(<8 x i8> inreg %a, i32 inreg %b) {
+define inreg i64 @bitcast_v8i8_to_i64_scalar(<8 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8i8_to_i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3509,7 +3509,7 @@ end:
   ret i64 %phi
 }
 
-define <2 x i32> @bitcast_f64_to_v2i32(double %a, i32 %b) {
+define <2 x i32> @bitcast_f64_to_v2i32(double %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_f64_to_v2i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3580,7 +3580,7 @@ end:
   ret <2 x i32> %phi
 }
 
-define inreg <2 x i32> @bitcast_f64_to_v2i32_scalar(double inreg %a, i32 inreg %b) {
+define inreg <2 x i32> @bitcast_f64_to_v2i32_scalar(double inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_f64_to_v2i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3666,7 +3666,7 @@ end:
   ret <2 x i32> %phi
 }
 
-define double @bitcast_v2i32_to_f64(<2 x i32> %a, i32 %b) {
+define double @bitcast_v2i32_to_f64(<2 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2i32_to_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3740,7 +3740,7 @@ end:
   ret double %phi
 }
 
-define inreg double @bitcast_v2i32_to_f64_scalar(<2 x i32> inreg %a, i32 inreg %b) {
+define inreg double @bitcast_v2i32_to_f64_scalar(<2 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2i32_to_f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3827,7 +3827,7 @@ end:
   ret double %phi
 }
 
-define <2 x float> @bitcast_f64_to_v2f32(double %a, i32 %b) {
+define <2 x float> @bitcast_f64_to_v2f32(double %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_f64_to_v2f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3898,7 +3898,7 @@ end:
   ret <2 x float> %phi
 }
 
-define inreg <2 x float> @bitcast_f64_to_v2f32_scalar(double inreg %a, i32 inreg %b) {
+define inreg <2 x float> @bitcast_f64_to_v2f32_scalar(double inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_f64_to_v2f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3984,7 +3984,7 @@ end:
   ret <2 x float> %phi
 }
 
-define double @bitcast_v2f32_to_f64(<2 x float> %a, i32 %b) {
+define double @bitcast_v2f32_to_f64(<2 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2f32_to_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4057,7 +4057,7 @@ end:
   ret double %phi
 }
 
-define inreg double @bitcast_v2f32_to_f64_scalar(<2 x float> inreg %a, i32 inreg %b) {
+define inreg double @bitcast_v2f32_to_f64_scalar(<2 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2f32_to_f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4147,7 +4147,7 @@ end:
   ret double %phi
 }
 
-define <4 x i16> @bitcast_f64_to_v4i16(double %a, i32 %b) {
+define <4 x i16> @bitcast_f64_to_v4i16(double %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_f64_to_v4i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4232,7 +4232,7 @@ end:
   ret <4 x i16> %phi
 }
 
-define inreg <4 x i16> @bitcast_f64_to_v4i16_scalar(double inreg %a, i32 inreg %b) {
+define inreg <4 x i16> @bitcast_f64_to_v4i16_scalar(double inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_f64_to_v4i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4333,7 +4333,7 @@ end:
   ret <4 x i16> %phi
 }
 
-define double @bitcast_v4i16_to_f64(<4 x i16> %a, i32 %b) {
+define double @bitcast_v4i16_to_f64(<4 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4i16_to_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4441,7 +4441,7 @@ end:
   ret double %phi
 }
 
-define inreg double @bitcast_v4i16_to_f64_scalar(<4 x i16> inreg %a, i32 inreg %b) {
+define inreg double @bitcast_v4i16_to_f64_scalar(<4 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4i16_to_f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4554,7 +4554,7 @@ end:
   ret double %phi
 }
 
-define <4 x half> @bitcast_f64_to_v4f16(double %a, i32 %b) {
+define <4 x half> @bitcast_f64_to_v4f16(double %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_f64_to_v4f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4639,7 +4639,7 @@ end:
   ret <4 x half> %phi
 }
 
-define inreg <4 x half> @bitcast_f64_to_v4f16_scalar(double inreg %a, i32 inreg %b) {
+define inreg <4 x half> @bitcast_f64_to_v4f16_scalar(double inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_f64_to_v4f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4740,7 +4740,7 @@ end:
   ret <4 x half> %phi
 }
 
-define double @bitcast_v4f16_to_f64(<4 x half> %a, i32 %b) {
+define double @bitcast_v4f16_to_f64(<4 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4f16_to_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4857,7 +4857,7 @@ end:
   ret double %phi
 }
 
-define inreg double @bitcast_v4f16_to_f64_scalar(<4 x half> inreg %a, i32 inreg %b) {
+define inreg double @bitcast_v4f16_to_f64_scalar(<4 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4f16_to_f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4980,7 +4980,7 @@ end:
   ret double %phi
 }
 
-define <4 x bfloat> @bitcast_f64_to_v4bf16(double %a, i32 %b) {
+define <4 x bfloat> @bitcast_f64_to_v4bf16(double %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_f64_to_v4bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5074,7 +5074,7 @@ end:
   ret <4 x bfloat> %phi
 }
 
-define inreg <4 x bfloat> @bitcast_f64_to_v4bf16_scalar(double inreg %a, i32 inreg %b) {
+define inreg <4 x bfloat> @bitcast_f64_to_v4bf16_scalar(double inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_f64_to_v4bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5183,7 +5183,7 @@ end:
   ret <4 x bfloat> %phi
 }
 
-define double @bitcast_v4bf16_to_f64(<4 x bfloat> %a, i32 %b) {
+define double @bitcast_v4bf16_to_f64(<4 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4bf16_to_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5446,7 +5446,7 @@ end:
   ret double %phi
 }
 
-define inreg double @bitcast_v4bf16_to_f64_scalar(<4 x bfloat> inreg %a, i32 inreg %b) {
+define inreg double @bitcast_v4bf16_to_f64_scalar(<4 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4bf16_to_f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5729,7 +5729,7 @@ end:
   ret double %phi
 }
 
-define <8 x i8> @bitcast_f64_to_v8i8(double %a, i32 %b) {
+define <8 x i8> @bitcast_f64_to_v8i8(double %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_f64_to_v8i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5934,7 +5934,7 @@ end:
   ret <8 x i8> %phi
 }
 
-define inreg <8 x i8> @bitcast_f64_to_v8i8_scalar(double inreg %a, i32 inreg %b) {
+define inreg <8 x i8> @bitcast_f64_to_v8i8_scalar(double inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_f64_to_v8i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6128,7 +6128,7 @@ end:
   ret <8 x i8> %phi
 }
 
-define double @bitcast_v8i8_to_f64(<8 x i8> %a, i32 %b) {
+define double @bitcast_v8i8_to_f64(<8 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8i8_to_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6461,7 +6461,7 @@ end:
   ret double %phi
 }
 
-define inreg double @bitcast_v8i8_to_f64_scalar(<8 x i8> inreg %a, i32 inreg %b) {
+define inreg double @bitcast_v8i8_to_f64_scalar(<8 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8i8_to_f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6685,7 +6685,7 @@ end:
   ret double %phi
 }
 
-define <2 x float> @bitcast_v2i32_to_v2f32(<2 x i32> %a, i32 %b) {
+define <2 x float> @bitcast_v2i32_to_v2f32(<2 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2i32_to_v2f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6759,7 +6759,7 @@ end:
   ret <2 x float> %phi
 }
 
-define inreg <2 x float> @bitcast_v2i32_to_v2f32_scalar(<2 x i32> inreg %a, i32 inreg %b) {
+define inreg <2 x float> @bitcast_v2i32_to_v2f32_scalar(<2 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2i32_to_v2f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6846,7 +6846,7 @@ end:
   ret <2 x float> %phi
 }
 
-define <2 x i32> @bitcast_v2f32_to_v2i32(<2 x float> %a, i32 %b) {
+define <2 x i32> @bitcast_v2f32_to_v2i32(<2 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2f32_to_v2i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6919,7 +6919,7 @@ end:
   ret <2 x i32> %phi
 }
 
-define inreg <2 x i32> @bitcast_v2f32_to_v2i32_scalar(<2 x float> inreg %a, i32 inreg %b) {
+define inreg <2 x i32> @bitcast_v2f32_to_v2i32_scalar(<2 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2f32_to_v2i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7009,7 +7009,7 @@ end:
   ret <2 x i32> %phi
 }
 
-define <4 x i16> @bitcast_v2i32_to_v4i16(<2 x i32> %a, i32 %b) {
+define <4 x i16> @bitcast_v2i32_to_v4i16(<2 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2i32_to_v4i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7097,7 +7097,7 @@ end:
   ret <4 x i16> %phi
 }
 
-define inreg <4 x i16> @bitcast_v2i32_to_v4i16_scalar(<2 x i32> inreg %a, i32 inreg %b) {
+define inreg <4 x i16> @bitcast_v2i32_to_v4i16_scalar(<2 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2i32_to_v4i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7196,7 +7196,7 @@ end:
   ret <4 x i16> %phi
 }
 
-define <2 x i32> @bitcast_v4i16_to_v2i32(<4 x i16> %a, i32 %b) {
+define <2 x i32> @bitcast_v4i16_to_v2i32(<4 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4i16_to_v2i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7304,7 +7304,7 @@ end:
   ret <2 x i32> %phi
 }
 
-define inreg <2 x i32> @bitcast_v4i16_to_v2i32_scalar(<4 x i16> inreg %a, i32 inreg %b) {
+define inreg <2 x i32> @bitcast_v4i16_to_v2i32_scalar(<4 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4i16_to_v2i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7417,7 +7417,7 @@ end:
   ret <2 x i32> %phi
 }
 
-define <4 x half> @bitcast_v2i32_to_v4f16(<2 x i32> %a, i32 %b) {
+define <4 x half> @bitcast_v2i32_to_v4f16(<2 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2i32_to_v4f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7505,7 +7505,7 @@ end:
   ret <4 x half> %phi
 }
 
-define inreg <4 x half> @bitcast_v2i32_to_v4f16_scalar(<2 x i32> inreg %a, i32 inreg %b) {
+define inreg <4 x half> @bitcast_v2i32_to_v4f16_scalar(<2 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2i32_to_v4f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7604,7 +7604,7 @@ end:
   ret <4 x half> %phi
 }
 
-define <2 x i32> @bitcast_v4f16_to_v2i32(<4 x half> %a, i32 %b) {
+define <2 x i32> @bitcast_v4f16_to_v2i32(<4 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4f16_to_v2i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7721,7 +7721,7 @@ end:
   ret <2 x i32> %phi
 }
 
-define inreg <2 x i32> @bitcast_v4f16_to_v2i32_scalar(<4 x half> inreg %a, i32 inreg %b) {
+define inreg <2 x i32> @bitcast_v4f16_to_v2i32_scalar(<4 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4f16_to_v2i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7844,7 +7844,7 @@ end:
   ret <2 x i32> %phi
 }
 
-define <4 x bfloat> @bitcast_v2i32_to_v4bf16(<2 x i32> %a, i32 %b) {
+define <4 x bfloat> @bitcast_v2i32_to_v4bf16(<2 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2i32_to_v4bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7942,7 +7942,7 @@ end:
   ret <4 x bfloat> %phi
 }
 
-define inreg <4 x bfloat> @bitcast_v2i32_to_v4bf16_scalar(<2 x i32> inreg %a, i32 inreg %b) {
+define inreg <4 x bfloat> @bitcast_v2i32_to_v4bf16_scalar(<2 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2i32_to_v4bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8047,7 +8047,7 @@ end:
   ret <4 x bfloat> %phi
 }
 
-define <2 x i32> @bitcast_v4bf16_to_v2i32(<4 x bfloat> %a, i32 %b) {
+define <2 x i32> @bitcast_v4bf16_to_v2i32(<4 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4bf16_to_v2i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8310,7 +8310,7 @@ end:
   ret <2 x i32> %phi
 }
 
-define inreg <2 x i32> @bitcast_v4bf16_to_v2i32_scalar(<4 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <2 x i32> @bitcast_v4bf16_to_v2i32_scalar(<4 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4bf16_to_v2i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8593,7 +8593,7 @@ end:
   ret <2 x i32> %phi
 }
 
-define <8 x i8> @bitcast_v2i32_to_v8i8(<2 x i32> %a, i32 %b) {
+define <8 x i8> @bitcast_v2i32_to_v8i8(<2 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2i32_to_v8i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8803,7 +8803,7 @@ end:
   ret <8 x i8> %phi
 }
 
-define inreg <8 x i8> @bitcast_v2i32_to_v8i8_scalar(<2 x i32> inreg %a, i32 inreg %b) {
+define inreg <8 x i8> @bitcast_v2i32_to_v8i8_scalar(<2 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2i32_to_v8i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8983,7 +8983,7 @@ end:
   ret <8 x i8> %phi
 }
 
-define <2 x i32> @bitcast_v8i8_to_v2i32(<8 x i8> %a, i32 %b) {
+define <2 x i32> @bitcast_v8i8_to_v2i32(<8 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8i8_to_v2i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9316,7 +9316,7 @@ end:
   ret <2 x i32> %phi
 }
 
-define inreg <2 x i32> @bitcast_v8i8_to_v2i32_scalar(<8 x i8> inreg %a, i32 inreg %b) {
+define inreg <2 x i32> @bitcast_v8i8_to_v2i32_scalar(<8 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8i8_to_v2i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9540,7 +9540,7 @@ end:
   ret <2 x i32> %phi
 }
 
-define <4 x i16> @bitcast_v2f32_to_v4i16(<2 x float> %a, i32 %b) {
+define <4 x i16> @bitcast_v2f32_to_v4i16(<2 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2f32_to_v4i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9627,7 +9627,7 @@ end:
   ret <4 x i16> %phi
 }
 
-define inreg <4 x i16> @bitcast_v2f32_to_v4i16_scalar(<2 x float> inreg %a, i32 inreg %b) {
+define inreg <4 x i16> @bitcast_v2f32_to_v4i16_scalar(<2 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2f32_to_v4i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9732,7 +9732,7 @@ end:
   ret <4 x i16> %phi
 }
 
-define <2 x float> @bitcast_v4i16_to_v2f32(<4 x i16> %a, i32 %b) {
+define <2 x float> @bitcast_v4i16_to_v2f32(<4 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4i16_to_v2f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9840,7 +9840,7 @@ end:
   ret <2 x float> %phi
 }
 
-define inreg <2 x float> @bitcast_v4i16_to_v2f32_scalar(<4 x i16> inreg %a, i32 inreg %b) {
+define inreg <2 x float> @bitcast_v4i16_to_v2f32_scalar(<4 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4i16_to_v2f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9953,7 +9953,7 @@ end:
   ret <2 x float> %phi
 }
 
-define <4 x half> @bitcast_v2f32_to_v4f16(<2 x float> %a, i32 %b) {
+define <4 x half> @bitcast_v2f32_to_v4f16(<2 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2f32_to_v4f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10040,7 +10040,7 @@ end:
   ret <4 x half> %phi
 }
 
-define inreg <4 x half> @bitcast_v2f32_to_v4f16_scalar(<2 x float> inreg %a, i32 inreg %b) {
+define inreg <4 x half> @bitcast_v2f32_to_v4f16_scalar(<2 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2f32_to_v4f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10145,7 +10145,7 @@ end:
   ret <4 x half> %phi
 }
 
-define <2 x float> @bitcast_v4f16_to_v2f32(<4 x half> %a, i32 %b) {
+define <2 x float> @bitcast_v4f16_to_v2f32(<4 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4f16_to_v2f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10262,7 +10262,7 @@ end:
   ret <2 x float> %phi
 }
 
-define inreg <2 x float> @bitcast_v4f16_to_v2f32_scalar(<4 x half> inreg %a, i32 inreg %b) {
+define inreg <2 x float> @bitcast_v4f16_to_v2f32_scalar(<4 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4f16_to_v2f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10385,7 +10385,7 @@ end:
   ret <2 x float> %phi
 }
 
-define <4 x bfloat> @bitcast_v2f32_to_v4bf16(<2 x float> %a, i32 %b) {
+define <4 x bfloat> @bitcast_v2f32_to_v4bf16(<2 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2f32_to_v4bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10482,7 +10482,7 @@ end:
   ret <4 x bfloat> %phi
 }
 
-define inreg <4 x bfloat> @bitcast_v2f32_to_v4bf16_scalar(<2 x float> inreg %a, i32 inreg %b) {
+define inreg <4 x bfloat> @bitcast_v2f32_to_v4bf16_scalar(<2 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2f32_to_v4bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10595,7 +10595,7 @@ end:
   ret <4 x bfloat> %phi
 }
 
-define <2 x float> @bitcast_v4bf16_to_v2f32(<4 x bfloat> %a, i32 %b) {
+define <2 x float> @bitcast_v4bf16_to_v2f32(<4 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4bf16_to_v2f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10858,7 +10858,7 @@ end:
   ret <2 x float> %phi
 }
 
-define inreg <2 x float> @bitcast_v4bf16_to_v2f32_scalar(<4 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <2 x float> @bitcast_v4bf16_to_v2f32_scalar(<4 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4bf16_to_v2f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11141,7 +11141,7 @@ end:
   ret <2 x float> %phi
 }
 
-define <8 x i8> @bitcast_v2f32_to_v8i8(<2 x float> %a, i32 %b) {
+define <8 x i8> @bitcast_v2f32_to_v8i8(<2 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2f32_to_v8i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11349,7 +11349,7 @@ end:
   ret <8 x i8> %phi
 }
 
-define inreg <8 x i8> @bitcast_v2f32_to_v8i8_scalar(<2 x float> inreg %a, i32 inreg %b) {
+define inreg <8 x i8> @bitcast_v2f32_to_v8i8_scalar(<2 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2f32_to_v8i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11547,7 +11547,7 @@ end:
   ret <8 x i8> %phi
 }
 
-define <2 x float> @bitcast_v8i8_to_v2f32(<8 x i8> %a, i32 %b) {
+define <2 x float> @bitcast_v8i8_to_v2f32(<8 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8i8_to_v2f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11880,7 +11880,7 @@ end:
   ret <2 x float> %phi
 }
 
-define inreg <2 x float> @bitcast_v8i8_to_v2f32_scalar(<8 x i8> inreg %a, i32 inreg %b) {
+define inreg <2 x float> @bitcast_v8i8_to_v2f32_scalar(<8 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8i8_to_v2f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12104,7 +12104,7 @@ end:
   ret <2 x float> %phi
 }
 
-define <4 x half> @bitcast_v4i16_to_v4f16(<4 x i16> %a, i32 %b) {
+define <4 x half> @bitcast_v4i16_to_v4f16(<4 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4i16_to_v4f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12216,7 +12216,7 @@ end:
   ret <4 x half> %phi
 }
 
-define inreg <4 x half> @bitcast_v4i16_to_v4f16_scalar(<4 x i16> inreg %a, i32 inreg %b) {
+define inreg <4 x half> @bitcast_v4i16_to_v4f16_scalar(<4 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4i16_to_v4f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12340,7 +12340,7 @@ end:
   ret <4 x half> %phi
 }
 
-define <4 x i16> @bitcast_v4f16_to_v4i16(<4 x half> %a, i32 %b) {
+define <4 x i16> @bitcast_v4f16_to_v4i16(<4 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4f16_to_v4i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12444,7 +12444,7 @@ end:
   ret <4 x i16> %phi
 }
 
-define inreg <4 x i16> @bitcast_v4f16_to_v4i16_scalar(<4 x half> inreg %a, i32 inreg %b) {
+define inreg <4 x i16> @bitcast_v4f16_to_v4i16_scalar(<4 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4f16_to_v4i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12570,7 +12570,7 @@ end:
   ret <4 x i16> %phi
 }
 
-define <4 x bfloat> @bitcast_v4i16_to_v4bf16(<4 x i16> %a, i32 %b) {
+define <4 x bfloat> @bitcast_v4i16_to_v4bf16(<4 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4i16_to_v4bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12680,7 +12680,7 @@ end:
   ret <4 x bfloat> %phi
 }
 
-define inreg <4 x bfloat> @bitcast_v4i16_to_v4bf16_scalar(<4 x i16> inreg %a, i32 inreg %b) {
+define inreg <4 x bfloat> @bitcast_v4i16_to_v4bf16_scalar(<4 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4i16_to_v4bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12804,7 +12804,7 @@ end:
   ret <4 x bfloat> %phi
 }
 
-define <4 x i16> @bitcast_v4bf16_to_v4i16(<4 x bfloat> %a, i32 %b) {
+define <4 x i16> @bitcast_v4bf16_to_v4i16(<4 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4bf16_to_v4i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13069,7 +13069,7 @@ end:
   ret <4 x i16> %phi
 }
 
-define inreg <4 x i16> @bitcast_v4bf16_to_v4i16_scalar(<4 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <4 x i16> @bitcast_v4bf16_to_v4i16_scalar(<4 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4bf16_to_v4i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13350,7 +13350,7 @@ end:
   ret <4 x i16> %phi
 }
 
-define <8 x i8> @bitcast_v4i16_to_v8i8(<4 x i16> %a, i32 %b) {
+define <8 x i8> @bitcast_v4i16_to_v8i8(<4 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4i16_to_v8i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13587,7 +13587,7 @@ end:
   ret <8 x i8> %phi
 }
 
-define inreg <8 x i8> @bitcast_v4i16_to_v8i8_scalar(<4 x i16> inreg %a, i32 inreg %b) {
+define inreg <8 x i8> @bitcast_v4i16_to_v8i8_scalar(<4 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4i16_to_v8i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13801,7 +13801,7 @@ end:
   ret <8 x i8> %phi
 }
 
-define <4 x i16> @bitcast_v8i8_to_v4i16(<8 x i8> %a, i32 %b) {
+define <4 x i16> @bitcast_v8i8_to_v4i16(<8 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8i8_to_v4i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14142,7 +14142,7 @@ end:
   ret <4 x i16> %phi
 }
 
-define inreg <4 x i16> @bitcast_v8i8_to_v4i16_scalar(<8 x i8> inreg %a, i32 inreg %b) {
+define inreg <4 x i16> @bitcast_v8i8_to_v4i16_scalar(<8 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8i8_to_v4i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14379,7 +14379,7 @@ end:
   ret <4 x i16> %phi
 }
 
-define <4 x bfloat> @bitcast_v4f16_to_v4bf16(<4 x half> %a, i32 %b) {
+define <4 x bfloat> @bitcast_v4f16_to_v4bf16(<4 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4f16_to_v4bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14498,7 +14498,7 @@ end:
   ret <4 x bfloat> %phi
 }
 
-define inreg <4 x bfloat> @bitcast_v4f16_to_v4bf16_scalar(<4 x half> inreg %a, i32 inreg %b) {
+define inreg <4 x bfloat> @bitcast_v4f16_to_v4bf16_scalar(<4 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4f16_to_v4bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14633,7 +14633,7 @@ end:
   ret <4 x bfloat> %phi
 }
 
-define <4 x half> @bitcast_v4bf16_to_v4f16(<4 x bfloat> %a, i32 %b) {
+define <4 x half> @bitcast_v4bf16_to_v4f16(<4 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4bf16_to_v4f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14907,7 +14907,7 @@ end:
   ret <4 x half> %phi
 }
 
-define inreg <4 x half> @bitcast_v4bf16_to_v4f16_scalar(<4 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <4 x half> @bitcast_v4bf16_to_v4f16_scalar(<4 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4bf16_to_v4f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15203,7 +15203,7 @@ end:
   ret <4 x half> %phi
 }
 
-define <8 x i8> @bitcast_v4f16_to_v8i8(<4 x half> %a, i32 %b) {
+define <8 x i8> @bitcast_v4f16_to_v8i8(<4 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4f16_to_v8i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15444,7 +15444,7 @@ end:
   ret <8 x i8> %phi
 }
 
-define inreg <8 x i8> @bitcast_v4f16_to_v8i8_scalar(<4 x half> inreg %a, i32 inreg %b) {
+define inreg <8 x i8> @bitcast_v4f16_to_v8i8_scalar(<4 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4f16_to_v8i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15668,7 +15668,7 @@ end:
   ret <8 x i8> %phi
 }
 
-define <4 x half> @bitcast_v8i8_to_v4f16(<8 x i8> %a, i32 %b) {
+define <4 x half> @bitcast_v8i8_to_v4f16(<8 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8i8_to_v4f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16009,7 +16009,7 @@ end:
   ret <4 x half> %phi
 }
 
-define inreg <4 x half> @bitcast_v8i8_to_v4f16_scalar(<8 x i8> inreg %a, i32 inreg %b) {
+define inreg <4 x half> @bitcast_v8i8_to_v4f16_scalar(<8 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8i8_to_v4f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16246,7 +16246,7 @@ end:
   ret <4 x half> %phi
 }
 
-define <8 x i8> @bitcast_v4bf16_to_v8i8(<4 x bfloat> %a, i32 %b) {
+define <8 x i8> @bitcast_v4bf16_to_v8i8(<4 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4bf16_to_v8i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16626,7 +16626,7 @@ end:
   ret <8 x i8> %phi
 }
 
-define inreg <8 x i8> @bitcast_v4bf16_to_v8i8_scalar(<4 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <8 x i8> @bitcast_v4bf16_to_v8i8_scalar(<4 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4bf16_to_v8i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17028,7 +17028,7 @@ end:
   ret <8 x i8> %phi
 }
 
-define <4 x bfloat> @bitcast_v8i8_to_v4bf16(<8 x i8> %a, i32 %b) {
+define <4 x bfloat> @bitcast_v8i8_to_v4bf16(<8 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8i8_to_v4bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17370,7 +17370,7 @@ end:
   ret <4 x bfloat> %phi
 }
 
-define inreg <4 x bfloat> @bitcast_v8i8_to_v4bf16_scalar(<8 x i8> inreg %a, i32 inreg %b) {
+define inreg <4 x bfloat> @bitcast_v8i8_to_v4bf16_scalar(<8 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8i8_to_v4bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17604,3 +17604,5 @@ end:
   %phi = phi <4 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
   ret <4 x bfloat> %phi
 }
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll
index d2f29b856a013..f1c80ed5d2873 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll
@@ -6,7 +6,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
-define <22 x float> @bitcast_v22i32_to_v22f32(<22 x i32> %a, i32 %b) {
+define <22 x float> @bitcast_v22i32_to_v22f32(<22 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v22i32_to_v22f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -164,7 +164,7 @@ end:
   ret <22 x float> %phi
 }
 
-define inreg <22 x float> @bitcast_v22i32_to_v22f32_scalar(<22 x i32> inreg %a, i32 inreg %b) {
+define inreg <22 x float> @bitcast_v22i32_to_v22f32_scalar(<22 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v22i32_to_v22f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -433,7 +433,7 @@ end:
   ret <22 x float> %phi
 }
 
-define <22 x i32> @bitcast_v22f32_to_v22i32(<22 x float> %a, i32 %b) {
+define <22 x i32> @bitcast_v22f32_to_v22i32(<22 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v22f32_to_v22i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -580,7 +580,7 @@ end:
   ret <22 x i32> %phi
 }
 
-define inreg <22 x i32> @bitcast_v22f32_to_v22i32_scalar(<22 x float> inreg %a, i32 inreg %b) {
+define inreg <22 x i32> @bitcast_v22f32_to_v22i32_scalar(<22 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v22f32_to_v22i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1076,7 +1076,7 @@ end:
   ret <22 x i32> %phi
 }
 
-define <11 x i64> @bitcast_v22i32_to_v11i64(<22 x i32> %a, i32 %b) {
+define <11 x i64> @bitcast_v22i32_to_v11i64(<22 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v22i32_to_v11i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1234,7 +1234,7 @@ end:
   ret <11 x i64> %phi
 }
 
-define inreg <11 x i64> @bitcast_v22i32_to_v11i64_scalar(<22 x i32> inreg %a, i32 inreg %b) {
+define inreg <11 x i64> @bitcast_v22i32_to_v11i64_scalar(<22 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v22i32_to_v11i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1503,7 +1503,7 @@ end:
   ret <11 x i64> %phi
 }
 
-define <22 x i32> @bitcast_v11i64_to_v22i32(<11 x i64> %a, i32 %b) {
+define <22 x i32> @bitcast_v11i64_to_v22i32(<11 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v11i64_to_v22i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1667,7 +1667,7 @@ end:
   ret <22 x i32> %phi
 }
 
-define inreg <22 x i32> @bitcast_v11i64_to_v22i32_scalar(<11 x i64> inreg %a, i32 inreg %b) {
+define inreg <22 x i32> @bitcast_v11i64_to_v22i32_scalar(<11 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v11i64_to_v22i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1936,7 +1936,7 @@ end:
   ret <22 x i32> %phi
 }
 
-define <11 x double> @bitcast_v22i32_to_v11f64(<22 x i32> %a, i32 %b) {
+define <11 x double> @bitcast_v22i32_to_v11f64(<22 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v22i32_to_v11f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2094,7 +2094,7 @@ end:
   ret <11 x double> %phi
 }
 
-define inreg <11 x double> @bitcast_v22i32_to_v11f64_scalar(<22 x i32> inreg %a, i32 inreg %b) {
+define inreg <11 x double> @bitcast_v22i32_to_v11f64_scalar(<22 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v22i32_to_v11f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2363,7 +2363,7 @@ end:
   ret <11 x double> %phi
 }
 
-define <22 x i32> @bitcast_v11f64_to_v22i32(<11 x double> %a, i32 %b) {
+define <22 x i32> @bitcast_v11f64_to_v22i32(<11 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v11f64_to_v22i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2477,7 +2477,7 @@ end:
   ret <22 x i32> %phi
 }
 
-define inreg <22 x i32> @bitcast_v11f64_to_v22i32_scalar(<11 x double> inreg %a, i32 inreg %b) {
+define inreg <22 x i32> @bitcast_v11f64_to_v22i32_scalar(<11 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v11f64_to_v22i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2929,7 +2929,7 @@ end:
   ret <22 x i32> %phi
 }
 
-define <44 x i16> @bitcast_v22i32_to_v44i16(<22 x i32> %a, i32 %b) {
+define <44 x i16> @bitcast_v22i32_to_v44i16(<22 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v22i32_to_v44i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3555,7 +3555,7 @@ end:
   ret <44 x i16> %phi
 }
 
-define inreg <44 x i16> @bitcast_v22i32_to_v44i16_scalar(<22 x i32> inreg %a, i32 inreg %b) {
+define inreg <44 x i16> @bitcast_v22i32_to_v44i16_scalar(<22 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v22i32_to_v44i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4264,7 +4264,7 @@ end:
   ret <44 x i16> %phi
 }
 
-define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) {
+define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v44i16_to_v22i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5212,7 +5212,7 @@ end:
   ret <22 x i32> %phi
 }
 
-define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i32 inreg %b) {
+define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v44i16_to_v22i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5220,23 +5220,25 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3
 ; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v22, s30, 0
-; SI-NEXT:    v_writelane_b32 v22, s31, 1
-; SI-NEXT:    v_writelane_b32 v22, s34, 2
-; SI-NEXT:    v_writelane_b32 v22, s35, 3
-; SI-NEXT:    v_writelane_b32 v22, s36, 4
-; SI-NEXT:    v_writelane_b32 v22, s37, 5
-; SI-NEXT:    v_writelane_b32 v22, s38, 6
-; SI-NEXT:    v_writelane_b32 v22, s39, 7
-; SI-NEXT:    v_writelane_b32 v22, s48, 8
-; SI-NEXT:    v_writelane_b32 v22, s49, 9
-; SI-NEXT:    v_writelane_b32 v22, s50, 10
-; SI-NEXT:    v_writelane_b32 v22, s51, 11
-; SI-NEXT:    v_writelane_b32 v22, s52, 12
-; SI-NEXT:    v_writelane_b32 v22, s53, 13
-; SI-NEXT:    v_writelane_b32 v22, s54, 14
-; SI-NEXT:    v_writelane_b32 v22, s55, 15
-; SI-NEXT:    v_writelane_b32 v22, s64, 16
+; SI-NEXT:    v_writelane_b32 v22, s34, 0
+; SI-NEXT:    v_writelane_b32 v22, s35, 1
+; SI-NEXT:    v_writelane_b32 v22, s36, 2
+; SI-NEXT:    v_writelane_b32 v22, s37, 3
+; SI-NEXT:    v_writelane_b32 v22, s38, 4
+; SI-NEXT:    v_writelane_b32 v22, s39, 5
+; SI-NEXT:    v_writelane_b32 v22, s48, 6
+; SI-NEXT:    v_writelane_b32 v22, s49, 7
+; SI-NEXT:    v_writelane_b32 v22, s50, 8
+; SI-NEXT:    v_writelane_b32 v22, s51, 9
+; SI-NEXT:    v_writelane_b32 v22, s52, 10
+; SI-NEXT:    v_writelane_b32 v22, s53, 11
+; SI-NEXT:    v_writelane_b32 v22, s54, 12
+; SI-NEXT:    v_writelane_b32 v22, s55, 13
+; SI-NEXT:    v_writelane_b32 v22, s64, 14
+; SI-NEXT:    v_writelane_b32 v22, s65, 15
+; SI-NEXT:    v_writelane_b32 v22, s66, 16
+; SI-NEXT:    v_writelane_b32 v22, s67, 17
+; SI-NEXT:    v_writelane_b32 v22, s30, 18
 ; SI-NEXT:    v_readfirstlane_b32 s7, v7
 ; SI-NEXT:    v_readfirstlane_b32 s9, v6
 ; SI-NEXT:    v_readfirstlane_b32 s11, v5
@@ -5245,7 +5247,7 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3
 ; SI-NEXT:    v_readfirstlane_b32 s75, v2
 ; SI-NEXT:    v_readfirstlane_b32 s79, v1
 ; SI-NEXT:    v_readfirstlane_b32 s90, v0
-; SI-NEXT:    v_writelane_b32 v22, s65, 17
+; SI-NEXT:    v_writelane_b32 v22, s31, 19
 ; SI-NEXT:    s_lshr_b32 s14, s29, 16
 ; SI-NEXT:    s_lshr_b32 s73, s28, 16
 ; SI-NEXT:    s_lshr_b32 s76, s27, 16
@@ -5269,9 +5271,7 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3
 ; SI-NEXT:    s_lshr_b32 s77, s79, 16
 ; SI-NEXT:    s_lshr_b32 s89, s90, 16
 ; SI-NEXT:    v_readfirstlane_b32 s4, v8
-; SI-NEXT:    v_writelane_b32 v22, s66, 18
 ; SI-NEXT:    s_cmp_lg_u32 s4, 0
-; SI-NEXT:    v_writelane_b32 v22, s67, 19
 ; SI-NEXT:    s_cbranch_scc0 .LBB15_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_and_b32 s4, s16, 0xffff
@@ -5453,6 +5453,7 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_add_i32 s57, s4, 0x30000
 ; SI-NEXT:  .LBB15_3: ; %end
+; SI-NEXT:    v_readlane_b32 s30, v22, 18
 ; SI-NEXT:    v_mov_b32_e32 v0, s36
 ; SI-NEXT:    v_mov_b32_e32 v1, s37
 ; SI-NEXT:    v_mov_b32_e32 v2, s38
@@ -5475,26 +5476,25 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3
 ; SI-NEXT:    v_mov_b32_e32 v19, s55
 ; SI-NEXT:    v_mov_b32_e32 v20, s56
 ; SI-NEXT:    v_mov_b32_e32 v21, s57
-; SI-NEXT:    v_readlane_b32 s67, v22, 19
-; SI-NEXT:    v_readlane_b32 s66, v22, 18
-; SI-NEXT:    v_readlane_b32 s65, v22, 17
-; SI-NEXT:    v_readlane_b32 s64, v22, 16
-; SI-NEXT:    v_readlane_b32 s55, v22, 15
-; SI-NEXT:    v_readlane_b32 s54, v22, 14
-; SI-NEXT:    v_readlane_b32 s53, v22, 13
-; SI-NEXT:    v_readlane_b32 s52, v22, 12
-; SI-NEXT:    v_readlane_b32 s51, v22, 11
-; SI-NEXT:    v_readlane_b32 s50, v22, 10
-; SI-NEXT:    v_readlane_b32 s49, v22, 9
-; SI-NEXT:    v_readlane_b32 s48, v22, 8
-; SI-NEXT:    v_readlane_b32 s39, v22, 7
-; SI-NEXT:    v_readlane_b32 s38, v22, 6
-; SI-NEXT:    v_readlane_b32 s37, v22, 5
-; SI-NEXT:    v_readlane_b32 s36, v22, 4
-; SI-NEXT:    v_readlane_b32 s35, v22, 3
-; SI-NEXT:    v_readlane_b32 s34, v22, 2
-; SI-NEXT:    v_readlane_b32 s31, v22, 1
-; SI-NEXT:    v_readlane_b32 s30, v22, 0
+; SI-NEXT:    v_readlane_b32 s31, v22, 19
+; SI-NEXT:    v_readlane_b32 s67, v22, 17
+; SI-NEXT:    v_readlane_b32 s66, v22, 16
+; SI-NEXT:    v_readlane_b32 s65, v22, 15
+; SI-NEXT:    v_readlane_b32 s64, v22, 14
+; SI-NEXT:    v_readlane_b32 s55, v22, 13
+; SI-NEXT:    v_readlane_b32 s54, v22, 12
+; SI-NEXT:    v_readlane_b32 s53, v22, 11
+; SI-NEXT:    v_readlane_b32 s52, v22, 10
+; SI-NEXT:    v_readlane_b32 s51, v22, 9
+; SI-NEXT:    v_readlane_b32 s50, v22, 8
+; SI-NEXT:    v_readlane_b32 s49, v22, 7
+; SI-NEXT:    v_readlane_b32 s48, v22, 6
+; SI-NEXT:    v_readlane_b32 s39, v22, 5
+; SI-NEXT:    v_readlane_b32 s38, v22, 4
+; SI-NEXT:    v_readlane_b32 s37, v22, 3
+; SI-NEXT:    v_readlane_b32 s36, v22, 2
+; SI-NEXT:    v_readlane_b32 s35, v22, 1
+; SI-NEXT:    v_readlane_b32 s34, v22, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -5510,29 +5510,29 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v22, s30, 0
-; VI-NEXT:    v_writelane_b32 v22, s31, 1
-; VI-NEXT:    v_writelane_b32 v22, s34, 2
-; VI-NEXT:    v_writelane_b32 v22, s35, 3
-; VI-NEXT:    v_writelane_b32 v22, s36, 4
-; VI-NEXT:    v_writelane_b32 v22, s37, 5
-; VI-NEXT:    v_writelane_b32 v22, s38, 6
-; VI-NEXT:    v_writelane_b32 v22, s39, 7
-; VI-NEXT:    v_writelane_b32 v22, s48, 8
-; VI-NEXT:    v_writelane_b32 v22, s49, 9
-; VI-NEXT:    v_writelane_b32 v22, s50, 10
-; VI-NEXT:    v_writelane_b32 v22, s51, 11
-; VI-NEXT:    v_writelane_b32 v22, s52, 12
-; VI-NEXT:    v_writelane_b32 v22, s53, 13
-; VI-NEXT:    v_writelane_b32 v22, s54, 14
-; VI-NEXT:    v_writelane_b32 v22, s55, 15
-; VI-NEXT:    v_writelane_b32 v22, s64, 16
-; VI-NEXT:    v_writelane_b32 v22, s65, 17
-; VI-NEXT:    v_writelane_b32 v22, s66, 18
-; VI-NEXT:    v_writelane_b32 v22, s67, 19
-; VI-NEXT:    v_writelane_b32 v22, s68, 20
-; VI-NEXT:    v_writelane_b32 v22, s69, 21
-; VI-NEXT:    v_writelane_b32 v22, s70, 22
+; VI-NEXT:    v_writelane_b32 v22, s34, 0
+; VI-NEXT:    v_writelane_b32 v22, s35, 1
+; VI-NEXT:    v_writelane_b32 v22, s36, 2
+; VI-NEXT:    v_writelane_b32 v22, s37, 3
+; VI-NEXT:    v_writelane_b32 v22, s38, 4
+; VI-NEXT:    v_writelane_b32 v22, s39, 5
+; VI-NEXT:    v_writelane_b32 v22, s48, 6
+; VI-NEXT:    v_writelane_b32 v22, s49, 7
+; VI-NEXT:    v_writelane_b32 v22, s50, 8
+; VI-NEXT:    v_writelane_b32 v22, s51, 9
+; VI-NEXT:    v_writelane_b32 v22, s52, 10
+; VI-NEXT:    v_writelane_b32 v22, s53, 11
+; VI-NEXT:    v_writelane_b32 v22, s54, 12
+; VI-NEXT:    v_writelane_b32 v22, s55, 13
+; VI-NEXT:    v_writelane_b32 v22, s64, 14
+; VI-NEXT:    v_writelane_b32 v22, s65, 15
+; VI-NEXT:    v_writelane_b32 v22, s66, 16
+; VI-NEXT:    v_writelane_b32 v22, s67, 17
+; VI-NEXT:    v_writelane_b32 v22, s68, 18
+; VI-NEXT:    v_writelane_b32 v22, s69, 19
+; VI-NEXT:    v_writelane_b32 v22, s70, 20
+; VI-NEXT:    v_writelane_b32 v22, s71, 21
+; VI-NEXT:    v_writelane_b32 v22, s30, 22
 ; VI-NEXT:    v_readfirstlane_b32 s7, v7
 ; VI-NEXT:    v_readfirstlane_b32 s9, v6
 ; VI-NEXT:    v_readfirstlane_b32 s11, v5
@@ -5541,7 +5541,7 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3
 ; VI-NEXT:    v_readfirstlane_b32 s75, v2
 ; VI-NEXT:    v_readfirstlane_b32 s79, v1
 ; VI-NEXT:    v_readfirstlane_b32 s90, v0
-; VI-NEXT:    v_writelane_b32 v22, s71, 23
+; VI-NEXT:    v_writelane_b32 v22, s31, 23
 ; VI-NEXT:    s_lshr_b32 s14, s29, 16
 ; VI-NEXT:    s_lshr_b32 s73, s28, 16
 ; VI-NEXT:    s_lshr_b32 s76, s27, 16
@@ -5747,6 +5747,7 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3
 ; VI-NEXT:    s_or_b32 s4, s5, s4
 ; VI-NEXT:    s_add_i32 s57, s4, 0x30000
 ; VI-NEXT:  .LBB15_3: ; %end
+; VI-NEXT:    v_readlane_b32 s30, v22, 22
 ; VI-NEXT:    v_mov_b32_e32 v0, s36
 ; VI-NEXT:    v_mov_b32_e32 v1, s37
 ; VI-NEXT:    v_mov_b32_e32 v2, s38
@@ -5769,30 +5770,29 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3
 ; VI-NEXT:    v_mov_b32_e32 v19, s55
 ; VI-NEXT:    v_mov_b32_e32 v20, s56
 ; VI-NEXT:    v_mov_b32_e32 v21, s57
-; VI-NEXT:    v_readlane_b32 s71, v22, 23
-; VI-NEXT:    v_readlane_b32 s70, v22, 22
-; VI-NEXT:    v_readlane_b32 s69, v22, 21
-; VI-NEXT:    v_readlane_b32 s68, v22, 20
-; VI-NEXT:    v_readlane_b32 s67, v22, 19
-; VI-NEXT:    v_readlane_b32 s66, v22, 18
-; VI-NEXT:    v_readlane_b32 s65, v22, 17
-; VI-NEXT:    v_readlane_b32 s64, v22, 16
-; VI-NEXT:    v_readlane_b32 s55, v22, 15
-; VI-NEXT:    v_readlane_b32 s54, v22, 14
-; VI-NEXT:    v_readlane_b32 s53, v22, 13
-; VI-NEXT:    v_readlane_b32 s52, v22, 12
-; VI-NEXT:    v_readlane_b32 s51, v22, 11
-; VI-NEXT:    v_readlane_b32 s50, v22, 10
-; VI-NEXT:    v_readlane_b32 s49, v22, 9
-; VI-NEXT:    v_readlane_b32 s48, v22, 8
-; VI-NEXT:    v_readlane_b32 s39, v22, 7
-; VI-NEXT:    v_readlane_b32 s38, v22, 6
-; VI-NEXT:    v_readlane_b32 s37, v22, 5
-; VI-NEXT:    v_readlane_b32 s36, v22, 4
-; VI-NEXT:    v_readlane_b32 s35, v22, 3
-; VI-NEXT:    v_readlane_b32 s34, v22, 2
-; VI-NEXT:    v_readlane_b32 s31, v22, 1
-; VI-NEXT:    v_readlane_b32 s30, v22, 0
+; VI-NEXT:    v_readlane_b32 s31, v22, 23
+; VI-NEXT:    v_readlane_b32 s71, v22, 21
+; VI-NEXT:    v_readlane_b32 s70, v22, 20
+; VI-NEXT:    v_readlane_b32 s69, v22, 19
+; VI-NEXT:    v_readlane_b32 s68, v22, 18
+; VI-NEXT:    v_readlane_b32 s67, v22, 17
+; VI-NEXT:    v_readlane_b32 s66, v22, 16
+; VI-NEXT:    v_readlane_b32 s65, v22, 15
+; VI-NEXT:    v_readlane_b32 s64, v22, 14
+; VI-NEXT:    v_readlane_b32 s55, v22, 13
+; VI-NEXT:    v_readlane_b32 s54, v22, 12
+; VI-NEXT:    v_readlane_b32 s53, v22, 11
+; VI-NEXT:    v_readlane_b32 s52, v22, 10
+; VI-NEXT:    v_readlane_b32 s51, v22, 9
+; VI-NEXT:    v_readlane_b32 s50, v22, 8
+; VI-NEXT:    v_readlane_b32 s49, v22, 7
+; VI-NEXT:    v_readlane_b32 s48, v22, 6
+; VI-NEXT:    v_readlane_b32 s39, v22, 5
+; VI-NEXT:    v_readlane_b32 s38, v22, 4
+; VI-NEXT:    v_readlane_b32 s37, v22, 3
+; VI-NEXT:    v_readlane_b32 s36, v22, 2
+; VI-NEXT:    v_readlane_b32 s35, v22, 1
+; VI-NEXT:    v_readlane_b32 s34, v22, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -6074,7 +6074,7 @@ end:
   ret <22 x i32> %phi
 }
 
-define <44 x half> @bitcast_v22i32_to_v44f16(<22 x i32> %a, i32 %b) {
+define <44 x half> @bitcast_v22i32_to_v44f16(<22 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v22i32_to_v44f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6700,7 +6700,7 @@ end:
   ret <44 x half> %phi
 }
 
-define inreg <44 x half> @bitcast_v22i32_to_v44f16_scalar(<22 x i32> inreg %a, i32 inreg %b) {
+define inreg <44 x half> @bitcast_v22i32_to_v44f16_scalar(<22 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v22i32_to_v44f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7409,7 +7409,7 @@ end:
   ret <44 x half> %phi
 }
 
-define <22 x i32> @bitcast_v44f16_to_v22i32(<44 x half> %a, i32 %b) {
+define <22 x i32> @bitcast_v44f16_to_v22i32(<44 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v44f16_to_v22i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8456,7 +8456,7 @@ end:
   ret <22 x i32> %phi
 }
 
-define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i32 inreg %b) {
+define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v44f16_to_v22i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8464,23 +8464,25 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v32, s30, 0
-; SI-NEXT:    v_writelane_b32 v32, s31, 1
-; SI-NEXT:    v_writelane_b32 v32, s34, 2
-; SI-NEXT:    v_writelane_b32 v32, s35, 3
-; SI-NEXT:    v_writelane_b32 v32, s36, 4
-; SI-NEXT:    v_writelane_b32 v32, s37, 5
-; SI-NEXT:    v_writelane_b32 v32, s38, 6
-; SI-NEXT:    v_writelane_b32 v32, s39, 7
-; SI-NEXT:    v_writelane_b32 v32, s48, 8
-; SI-NEXT:    v_writelane_b32 v32, s49, 9
-; SI-NEXT:    v_writelane_b32 v32, s50, 10
-; SI-NEXT:    v_writelane_b32 v32, s51, 11
-; SI-NEXT:    v_writelane_b32 v32, s52, 12
-; SI-NEXT:    v_writelane_b32 v32, s53, 13
-; SI-NEXT:    v_writelane_b32 v32, s54, 14
-; SI-NEXT:    v_writelane_b32 v32, s55, 15
-; SI-NEXT:    v_writelane_b32 v32, s64, 16
+; SI-NEXT:    v_writelane_b32 v32, s34, 0
+; SI-NEXT:    v_writelane_b32 v32, s35, 1
+; SI-NEXT:    v_writelane_b32 v32, s36, 2
+; SI-NEXT:    v_writelane_b32 v32, s37, 3
+; SI-NEXT:    v_writelane_b32 v32, s38, 4
+; SI-NEXT:    v_writelane_b32 v32, s39, 5
+; SI-NEXT:    v_writelane_b32 v32, s48, 6
+; SI-NEXT:    v_writelane_b32 v32, s49, 7
+; SI-NEXT:    v_writelane_b32 v32, s50, 8
+; SI-NEXT:    v_writelane_b32 v32, s51, 9
+; SI-NEXT:    v_writelane_b32 v32, s52, 10
+; SI-NEXT:    v_writelane_b32 v32, s53, 11
+; SI-NEXT:    v_writelane_b32 v32, s54, 12
+; SI-NEXT:    v_writelane_b32 v32, s55, 13
+; SI-NEXT:    v_writelane_b32 v32, s64, 14
+; SI-NEXT:    v_writelane_b32 v32, s65, 15
+; SI-NEXT:    v_writelane_b32 v32, s66, 16
+; SI-NEXT:    v_writelane_b32 v32, s67, 17
+; SI-NEXT:    v_writelane_b32 v32, s30, 18
 ; SI-NEXT:    v_readfirstlane_b32 s6, v7
 ; SI-NEXT:    v_readfirstlane_b32 s8, v6
 ; SI-NEXT:    v_readfirstlane_b32 s10, v5
@@ -8489,7 +8491,7 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i
 ; SI-NEXT:    v_readfirstlane_b32 s72, v2
 ; SI-NEXT:    v_readfirstlane_b32 s74, v1
 ; SI-NEXT:    v_readfirstlane_b32 s77, v0
-; SI-NEXT:    v_writelane_b32 v32, s65, 17
+; SI-NEXT:    v_writelane_b32 v32, s31, 19
 ; SI-NEXT:    s_lshr_b32 s75, s29, 16
 ; SI-NEXT:    s_lshr_b32 s78, s28, 16
 ; SI-NEXT:    s_lshr_b32 s88, s27, 16
@@ -8513,9 +8515,7 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i
 ; SI-NEXT:    s_lshr_b32 s76, s74, 16
 ; SI-NEXT:    s_lshr_b32 s79, s77, 16
 ; SI-NEXT:    v_readfirstlane_b32 s4, v8
-; SI-NEXT:    v_writelane_b32 v32, s66, 18
 ; SI-NEXT:    s_cmp_lg_u32 s4, 0
-; SI-NEXT:    v_writelane_b32 v32, s67, 19
 ; SI-NEXT:    s_cbranch_scc0 .LBB19_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_and_b32 s4, s16, 0xffff
@@ -8800,26 +8800,26 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v30, s66
 ; SI-NEXT:    v_mov_b32_e32 v31, s67
 ; SI-NEXT:  .LBB19_5: ; %end
-; SI-NEXT:    v_readlane_b32 s67, v32, 19
-; SI-NEXT:    v_readlane_b32 s66, v32, 18
-; SI-NEXT:    v_readlane_b32 s65, v32, 17
-; SI-NEXT:    v_readlane_b32 s64, v32, 16
-; SI-NEXT:    v_readlane_b32 s55, v32, 15
-; SI-NEXT:    v_readlane_b32 s54, v32, 14
-; SI-NEXT:    v_readlane_b32 s53, v32, 13
-; SI-NEXT:    v_readlane_b32 s52, v32, 12
-; SI-NEXT:    v_readlane_b32 s51, v32, 11
-; SI-NEXT:    v_readlane_b32 s50, v32, 10
-; SI-NEXT:    v_readlane_b32 s49, v32, 9
-; SI-NEXT:    v_readlane_b32 s48, v32, 8
-; SI-NEXT:    v_readlane_b32 s39, v32, 7
-; SI-NEXT:    v_readlane_b32 s38, v32, 6
-; SI-NEXT:    v_readlane_b32 s37, v32, 5
-; SI-NEXT:    v_readlane_b32 s36, v32, 4
-; SI-NEXT:    v_readlane_b32 s35, v32, 3
-; SI-NEXT:    v_readlane_b32 s34, v32, 2
-; SI-NEXT:    v_readlane_b32 s31, v32, 1
-; SI-NEXT:    v_readlane_b32 s30, v32, 0
+; SI-NEXT:    v_readlane_b32 s30, v32, 18
+; SI-NEXT:    v_readlane_b32 s31, v32, 19
+; SI-NEXT:    v_readlane_b32 s67, v32, 17
+; SI-NEXT:    v_readlane_b32 s66, v32, 16
+; SI-NEXT:    v_readlane_b32 s65, v32, 15
+; SI-NEXT:    v_readlane_b32 s64, v32, 14
+; SI-NEXT:    v_readlane_b32 s55, v32, 13
+; SI-NEXT:    v_readlane_b32 s54, v32, 12
+; SI-NEXT:    v_readlane_b32 s53, v32, 11
+; SI-NEXT:    v_readlane_b32 s52, v32, 10
+; SI-NEXT:    v_readlane_b32 s51, v32, 9
+; SI-NEXT:    v_readlane_b32 s50, v32, 8
+; SI-NEXT:    v_readlane_b32 s49, v32, 7
+; SI-NEXT:    v_readlane_b32 s48, v32, 6
+; SI-NEXT:    v_readlane_b32 s39, v32, 5
+; SI-NEXT:    v_readlane_b32 s38, v32, 4
+; SI-NEXT:    v_readlane_b32 s37, v32, 3
+; SI-NEXT:    v_readlane_b32 s36, v32, 2
+; SI-NEXT:    v_readlane_b32 s35, v32, 1
+; SI-NEXT:    v_readlane_b32 s34, v32, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -8832,29 +8832,29 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v32, s30, 0
-; VI-NEXT:    v_writelane_b32 v32, s31, 1
-; VI-NEXT:    v_writelane_b32 v32, s34, 2
-; VI-NEXT:    v_writelane_b32 v32, s35, 3
-; VI-NEXT:    v_writelane_b32 v32, s36, 4
-; VI-NEXT:    v_writelane_b32 v32, s37, 5
-; VI-NEXT:    v_writelane_b32 v32, s38, 6
-; VI-NEXT:    v_writelane_b32 v32, s39, 7
-; VI-NEXT:    v_writelane_b32 v32, s48, 8
-; VI-NEXT:    v_writelane_b32 v32, s49, 9
-; VI-NEXT:    v_writelane_b32 v32, s50, 10
-; VI-NEXT:    v_writelane_b32 v32, s51, 11
-; VI-NEXT:    v_writelane_b32 v32, s52, 12
-; VI-NEXT:    v_writelane_b32 v32, s53, 13
-; VI-NEXT:    v_writelane_b32 v32, s54, 14
-; VI-NEXT:    v_writelane_b32 v32, s55, 15
-; VI-NEXT:    v_writelane_b32 v32, s64, 16
-; VI-NEXT:    v_writelane_b32 v32, s65, 17
-; VI-NEXT:    v_writelane_b32 v32, s66, 18
-; VI-NEXT:    v_writelane_b32 v32, s67, 19
-; VI-NEXT:    v_writelane_b32 v32, s68, 20
-; VI-NEXT:    v_writelane_b32 v32, s69, 21
-; VI-NEXT:    v_writelane_b32 v32, s70, 22
+; VI-NEXT:    v_writelane_b32 v32, s34, 0
+; VI-NEXT:    v_writelane_b32 v32, s35, 1
+; VI-NEXT:    v_writelane_b32 v32, s36, 2
+; VI-NEXT:    v_writelane_b32 v32, s37, 3
+; VI-NEXT:    v_writelane_b32 v32, s38, 4
+; VI-NEXT:    v_writelane_b32 v32, s39, 5
+; VI-NEXT:    v_writelane_b32 v32, s48, 6
+; VI-NEXT:    v_writelane_b32 v32, s49, 7
+; VI-NEXT:    v_writelane_b32 v32, s50, 8
+; VI-NEXT:    v_writelane_b32 v32, s51, 9
+; VI-NEXT:    v_writelane_b32 v32, s52, 10
+; VI-NEXT:    v_writelane_b32 v32, s53, 11
+; VI-NEXT:    v_writelane_b32 v32, s54, 12
+; VI-NEXT:    v_writelane_b32 v32, s55, 13
+; VI-NEXT:    v_writelane_b32 v32, s64, 14
+; VI-NEXT:    v_writelane_b32 v32, s65, 15
+; VI-NEXT:    v_writelane_b32 v32, s66, 16
+; VI-NEXT:    v_writelane_b32 v32, s67, 17
+; VI-NEXT:    v_writelane_b32 v32, s68, 18
+; VI-NEXT:    v_writelane_b32 v32, s69, 19
+; VI-NEXT:    v_writelane_b32 v32, s70, 20
+; VI-NEXT:    v_writelane_b32 v32, s71, 21
+; VI-NEXT:    v_writelane_b32 v32, s30, 22
 ; VI-NEXT:    v_readfirstlane_b32 s6, v7
 ; VI-NEXT:    v_readfirstlane_b32 s8, v6
 ; VI-NEXT:    v_readfirstlane_b32 s10, v5
@@ -8863,7 +8863,7 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i
 ; VI-NEXT:    v_readfirstlane_b32 s75, v2
 ; VI-NEXT:    v_readfirstlane_b32 s78, v1
 ; VI-NEXT:    v_readfirstlane_b32 s89, v0
-; VI-NEXT:    v_writelane_b32 v32, s71, 23
+; VI-NEXT:    v_writelane_b32 v32, s31, 23
 ; VI-NEXT:    s_lshr_b32 s13, s29, 16
 ; VI-NEXT:    s_lshr_b32 s72, s28, 16
 ; VI-NEXT:    s_lshr_b32 s74, s27, 16
@@ -9085,30 +9085,30 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i
 ; VI-NEXT:    v_mov_b32_e32 v30, s66
 ; VI-NEXT:    v_mov_b32_e32 v31, s67
 ; VI-NEXT:  .LBB19_5: ; %end
-; VI-NEXT:    v_readlane_b32 s71, v32, 23
-; VI-NEXT:    v_readlane_b32 s70, v32, 22
-; VI-NEXT:    v_readlane_b32 s69, v32, 21
-; VI-NEXT:    v_readlane_b32 s68, v32, 20
-; VI-NEXT:    v_readlane_b32 s67, v32, 19
-; VI-NEXT:    v_readlane_b32 s66, v32, 18
-; VI-NEXT:    v_readlane_b32 s65, v32, 17
-; VI-NEXT:    v_readlane_b32 s64, v32, 16
-; VI-NEXT:    v_readlane_b32 s55, v32, 15
-; VI-NEXT:    v_readlane_b32 s54, v32, 14
-; VI-NEXT:    v_readlane_b32 s53, v32, 13
-; VI-NEXT:    v_readlane_b32 s52, v32, 12
-; VI-NEXT:    v_readlane_b32 s51, v32, 11
-; VI-NEXT:    v_readlane_b32 s50, v32, 10
-; VI-NEXT:    v_readlane_b32 s49, v32, 9
-; VI-NEXT:    v_readlane_b32 s48, v32, 8
-; VI-NEXT:    v_readlane_b32 s39, v32, 7
-; VI-NEXT:    v_readlane_b32 s38, v32, 6
-; VI-NEXT:    v_readlane_b32 s37, v32, 5
-; VI-NEXT:    v_readlane_b32 s36, v32, 4
-; VI-NEXT:    v_readlane_b32 s35, v32, 3
-; VI-NEXT:    v_readlane_b32 s34, v32, 2
-; VI-NEXT:    v_readlane_b32 s31, v32, 1
-; VI-NEXT:    v_readlane_b32 s30, v32, 0
+; VI-NEXT:    v_readlane_b32 s30, v32, 22
+; VI-NEXT:    v_readlane_b32 s31, v32, 23
+; VI-NEXT:    v_readlane_b32 s71, v32, 21
+; VI-NEXT:    v_readlane_b32 s70, v32, 20
+; VI-NEXT:    v_readlane_b32 s69, v32, 19
+; VI-NEXT:    v_readlane_b32 s68, v32, 18
+; VI-NEXT:    v_readlane_b32 s67, v32, 17
+; VI-NEXT:    v_readlane_b32 s66, v32, 16
+; VI-NEXT:    v_readlane_b32 s65, v32, 15
+; VI-NEXT:    v_readlane_b32 s64, v32, 14
+; VI-NEXT:    v_readlane_b32 s55, v32, 13
+; VI-NEXT:    v_readlane_b32 s54, v32, 12
+; VI-NEXT:    v_readlane_b32 s53, v32, 11
+; VI-NEXT:    v_readlane_b32 s52, v32, 10
+; VI-NEXT:    v_readlane_b32 s51, v32, 9
+; VI-NEXT:    v_readlane_b32 s50, v32, 8
+; VI-NEXT:    v_readlane_b32 s49, v32, 7
+; VI-NEXT:    v_readlane_b32 s48, v32, 6
+; VI-NEXT:    v_readlane_b32 s39, v32, 5
+; VI-NEXT:    v_readlane_b32 s38, v32, 4
+; VI-NEXT:    v_readlane_b32 s37, v32, 3
+; VI-NEXT:    v_readlane_b32 s36, v32, 2
+; VI-NEXT:    v_readlane_b32 s35, v32, 1
+; VI-NEXT:    v_readlane_b32 s34, v32, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -9388,7 +9388,7 @@ end:
   ret <22 x i32> %phi
 }
 
-define <11 x i64> @bitcast_v22f32_to_v11i64(<22 x float> %a, i32 %b) {
+define <11 x i64> @bitcast_v22f32_to_v11i64(<22 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v22f32_to_v11i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9535,7 +9535,7 @@ end:
   ret <11 x i64> %phi
 }
 
-define inreg <11 x i64> @bitcast_v22f32_to_v11i64_scalar(<22 x float> inreg %a, i32 inreg %b) {
+define inreg <11 x i64> @bitcast_v22f32_to_v11i64_scalar(<22 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v22f32_to_v11i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10031,7 +10031,7 @@ end:
   ret <11 x i64> %phi
 }
 
-define <22 x float> @bitcast_v11i64_to_v22f32(<11 x i64> %a, i32 %b) {
+define <22 x float> @bitcast_v11i64_to_v22f32(<11 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v11i64_to_v22f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10195,7 +10195,7 @@ end:
   ret <22 x float> %phi
 }
 
-define inreg <22 x float> @bitcast_v11i64_to_v22f32_scalar(<11 x i64> inreg %a, i32 inreg %b) {
+define inreg <22 x float> @bitcast_v11i64_to_v22f32_scalar(<11 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v11i64_to_v22f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10464,7 +10464,7 @@ end:
   ret <22 x float> %phi
 }
 
-define <11 x double> @bitcast_v22f32_to_v11f64(<22 x float> %a, i32 %b) {
+define <11 x double> @bitcast_v22f32_to_v11f64(<22 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v22f32_to_v11f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10611,7 +10611,7 @@ end:
   ret <11 x double> %phi
 }
 
-define inreg <11 x double> @bitcast_v22f32_to_v11f64_scalar(<22 x float> inreg %a, i32 inreg %b) {
+define inreg <11 x double> @bitcast_v22f32_to_v11f64_scalar(<22 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v22f32_to_v11f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11107,7 +11107,7 @@ end:
   ret <11 x double> %phi
 }
 
-define <22 x float> @bitcast_v11f64_to_v22f32(<11 x double> %a, i32 %b) {
+define <22 x float> @bitcast_v11f64_to_v22f32(<11 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v11f64_to_v22f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11221,7 +11221,7 @@ end:
   ret <22 x float> %phi
 }
 
-define inreg <22 x float> @bitcast_v11f64_to_v22f32_scalar(<11 x double> inreg %a, i32 inreg %b) {
+define inreg <22 x float> @bitcast_v11f64_to_v22f32_scalar(<11 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v11f64_to_v22f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11673,7 +11673,7 @@ end:
   ret <22 x float> %phi
 }
 
-define <44 x i16> @bitcast_v22f32_to_v44i16(<22 x float> %a, i32 %b) {
+define <44 x i16> @bitcast_v22f32_to_v44i16(<22 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v22f32_to_v44i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12277,7 +12277,7 @@ end:
   ret <44 x i16> %phi
 }
 
-define inreg <44 x i16> @bitcast_v22f32_to_v44i16_scalar(<22 x float> inreg %a, i32 inreg %b) {
+define inreg <44 x i16> @bitcast_v22f32_to_v44i16_scalar(<22 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v22f32_to_v44i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13247,7 +13247,7 @@ end:
   ret <44 x i16> %phi
 }
 
-define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) {
+define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v44i16_to_v22f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14195,7 +14195,7 @@ end:
   ret <22 x float> %phi
 }
 
-define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a, i32 inreg %b) {
+define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v44i16_to_v22f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14203,23 +14203,25 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a,
 ; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v22, s30, 0
-; SI-NEXT:    v_writelane_b32 v22, s31, 1
-; SI-NEXT:    v_writelane_b32 v22, s34, 2
-; SI-NEXT:    v_writelane_b32 v22, s35, 3
-; SI-NEXT:    v_writelane_b32 v22, s36, 4
-; SI-NEXT:    v_writelane_b32 v22, s37, 5
-; SI-NEXT:    v_writelane_b32 v22, s38, 6
-; SI-NEXT:    v_writelane_b32 v22, s39, 7
-; SI-NEXT:    v_writelane_b32 v22, s48, 8
-; SI-NEXT:    v_writelane_b32 v22, s49, 9
-; SI-NEXT:    v_writelane_b32 v22, s50, 10
-; SI-NEXT:    v_writelane_b32 v22, s51, 11
-; SI-NEXT:    v_writelane_b32 v22, s52, 12
-; SI-NEXT:    v_writelane_b32 v22, s53, 13
-; SI-NEXT:    v_writelane_b32 v22, s54, 14
-; SI-NEXT:    v_writelane_b32 v22, s55, 15
-; SI-NEXT:    v_writelane_b32 v22, s64, 16
+; SI-NEXT:    v_writelane_b32 v22, s34, 0
+; SI-NEXT:    v_writelane_b32 v22, s35, 1
+; SI-NEXT:    v_writelane_b32 v22, s36, 2
+; SI-NEXT:    v_writelane_b32 v22, s37, 3
+; SI-NEXT:    v_writelane_b32 v22, s38, 4
+; SI-NEXT:    v_writelane_b32 v22, s39, 5
+; SI-NEXT:    v_writelane_b32 v22, s48, 6
+; SI-NEXT:    v_writelane_b32 v22, s49, 7
+; SI-NEXT:    v_writelane_b32 v22, s50, 8
+; SI-NEXT:    v_writelane_b32 v22, s51, 9
+; SI-NEXT:    v_writelane_b32 v22, s52, 10
+; SI-NEXT:    v_writelane_b32 v22, s53, 11
+; SI-NEXT:    v_writelane_b32 v22, s54, 12
+; SI-NEXT:    v_writelane_b32 v22, s55, 13
+; SI-NEXT:    v_writelane_b32 v22, s64, 14
+; SI-NEXT:    v_writelane_b32 v22, s65, 15
+; SI-NEXT:    v_writelane_b32 v22, s66, 16
+; SI-NEXT:    v_writelane_b32 v22, s67, 17
+; SI-NEXT:    v_writelane_b32 v22, s30, 18
 ; SI-NEXT:    v_readfirstlane_b32 s7, v7
 ; SI-NEXT:    v_readfirstlane_b32 s9, v6
 ; SI-NEXT:    v_readfirstlane_b32 s11, v5
@@ -14228,7 +14230,7 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a,
 ; SI-NEXT:    v_readfirstlane_b32 s75, v2
 ; SI-NEXT:    v_readfirstlane_b32 s79, v1
 ; SI-NEXT:    v_readfirstlane_b32 s90, v0
-; SI-NEXT:    v_writelane_b32 v22, s65, 17
+; SI-NEXT:    v_writelane_b32 v22, s31, 19
 ; SI-NEXT:    s_lshr_b32 s14, s29, 16
 ; SI-NEXT:    s_lshr_b32 s73, s28, 16
 ; SI-NEXT:    s_lshr_b32 s76, s27, 16
@@ -14252,9 +14254,7 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a,
 ; SI-NEXT:    s_lshr_b32 s77, s79, 16
 ; SI-NEXT:    s_lshr_b32 s89, s90, 16
 ; SI-NEXT:    v_readfirstlane_b32 s4, v8
-; SI-NEXT:    v_writelane_b32 v22, s66, 18
 ; SI-NEXT:    s_cmp_lg_u32 s4, 0
-; SI-NEXT:    v_writelane_b32 v22, s67, 19
 ; SI-NEXT:    s_cbranch_scc0 .LBB31_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_and_b32 s4, s16, 0xffff
@@ -14436,6 +14436,7 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a,
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_add_i32 s57, s4, 0x30000
 ; SI-NEXT:  .LBB31_3: ; %end
+; SI-NEXT:    v_readlane_b32 s30, v22, 18
 ; SI-NEXT:    v_mov_b32_e32 v0, s36
 ; SI-NEXT:    v_mov_b32_e32 v1, s37
 ; SI-NEXT:    v_mov_b32_e32 v2, s38
@@ -14458,26 +14459,25 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a,
 ; SI-NEXT:    v_mov_b32_e32 v19, s55
 ; SI-NEXT:    v_mov_b32_e32 v20, s56
 ; SI-NEXT:    v_mov_b32_e32 v21, s57
-; SI-NEXT:    v_readlane_b32 s67, v22, 19
-; SI-NEXT:    v_readlane_b32 s66, v22, 18
-; SI-NEXT:    v_readlane_b32 s65, v22, 17
-; SI-NEXT:    v_readlane_b32 s64, v22, 16
-; SI-NEXT:    v_readlane_b32 s55, v22, 15
-; SI-NEXT:    v_readlane_b32 s54, v22, 14
-; SI-NEXT:    v_readlane_b32 s53, v22, 13
-; SI-NEXT:    v_readlane_b32 s52, v22, 12
-; SI-NEXT:    v_readlane_b32 s51, v22, 11
-; SI-NEXT:    v_readlane_b32 s50, v22, 10
-; SI-NEXT:    v_readlane_b32 s49, v22, 9
-; SI-NEXT:    v_readlane_b32 s48, v22, 8
-; SI-NEXT:    v_readlane_b32 s39, v22, 7
-; SI-NEXT:    v_readlane_b32 s38, v22, 6
-; SI-NEXT:    v_readlane_b32 s37, v22, 5
-; SI-NEXT:    v_readlane_b32 s36, v22, 4
-; SI-NEXT:    v_readlane_b32 s35, v22, 3
-; SI-NEXT:    v_readlane_b32 s34, v22, 2
-; SI-NEXT:    v_readlane_b32 s31, v22, 1
-; SI-NEXT:    v_readlane_b32 s30, v22, 0
+; SI-NEXT:    v_readlane_b32 s31, v22, 19
+; SI-NEXT:    v_readlane_b32 s67, v22, 17
+; SI-NEXT:    v_readlane_b32 s66, v22, 16
+; SI-NEXT:    v_readlane_b32 s65, v22, 15
+; SI-NEXT:    v_readlane_b32 s64, v22, 14
+; SI-NEXT:    v_readlane_b32 s55, v22, 13
+; SI-NEXT:    v_readlane_b32 s54, v22, 12
+; SI-NEXT:    v_readlane_b32 s53, v22, 11
+; SI-NEXT:    v_readlane_b32 s52, v22, 10
+; SI-NEXT:    v_readlane_b32 s51, v22, 9
+; SI-NEXT:    v_readlane_b32 s50, v22, 8
+; SI-NEXT:    v_readlane_b32 s49, v22, 7
+; SI-NEXT:    v_readlane_b32 s48, v22, 6
+; SI-NEXT:    v_readlane_b32 s39, v22, 5
+; SI-NEXT:    v_readlane_b32 s38, v22, 4
+; SI-NEXT:    v_readlane_b32 s37, v22, 3
+; SI-NEXT:    v_readlane_b32 s36, v22, 2
+; SI-NEXT:    v_readlane_b32 s35, v22, 1
+; SI-NEXT:    v_readlane_b32 s34, v22, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -14493,29 +14493,29 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a,
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v22, s30, 0
-; VI-NEXT:    v_writelane_b32 v22, s31, 1
-; VI-NEXT:    v_writelane_b32 v22, s34, 2
-; VI-NEXT:    v_writelane_b32 v22, s35, 3
-; VI-NEXT:    v_writelane_b32 v22, s36, 4
-; VI-NEXT:    v_writelane_b32 v22, s37, 5
-; VI-NEXT:    v_writelane_b32 v22, s38, 6
-; VI-NEXT:    v_writelane_b32 v22, s39, 7
-; VI-NEXT:    v_writelane_b32 v22, s48, 8
-; VI-NEXT:    v_writelane_b32 v22, s49, 9
-; VI-NEXT:    v_writelane_b32 v22, s50, 10
-; VI-NEXT:    v_writelane_b32 v22, s51, 11
-; VI-NEXT:    v_writelane_b32 v22, s52, 12
-; VI-NEXT:    v_writelane_b32 v22, s53, 13
-; VI-NEXT:    v_writelane_b32 v22, s54, 14
-; VI-NEXT:    v_writelane_b32 v22, s55, 15
-; VI-NEXT:    v_writelane_b32 v22, s64, 16
-; VI-NEXT:    v_writelane_b32 v22, s65, 17
-; VI-NEXT:    v_writelane_b32 v22, s66, 18
-; VI-NEXT:    v_writelane_b32 v22, s67, 19
-; VI-NEXT:    v_writelane_b32 v22, s68, 20
-; VI-NEXT:    v_writelane_b32 v22, s69, 21
-; VI-NEXT:    v_writelane_b32 v22, s70, 22
+; VI-NEXT:    v_writelane_b32 v22, s34, 0
+; VI-NEXT:    v_writelane_b32 v22, s35, 1
+; VI-NEXT:    v_writelane_b32 v22, s36, 2
+; VI-NEXT:    v_writelane_b32 v22, s37, 3
+; VI-NEXT:    v_writelane_b32 v22, s38, 4
+; VI-NEXT:    v_writelane_b32 v22, s39, 5
+; VI-NEXT:    v_writelane_b32 v22, s48, 6
+; VI-NEXT:    v_writelane_b32 v22, s49, 7
+; VI-NEXT:    v_writelane_b32 v22, s50, 8
+; VI-NEXT:    v_writelane_b32 v22, s51, 9
+; VI-NEXT:    v_writelane_b32 v22, s52, 10
+; VI-NEXT:    v_writelane_b32 v22, s53, 11
+; VI-NEXT:    v_writelane_b32 v22, s54, 12
+; VI-NEXT:    v_writelane_b32 v22, s55, 13
+; VI-NEXT:    v_writelane_b32 v22, s64, 14
+; VI-NEXT:    v_writelane_b32 v22, s65, 15
+; VI-NEXT:    v_writelane_b32 v22, s66, 16
+; VI-NEXT:    v_writelane_b32 v22, s67, 17
+; VI-NEXT:    v_writelane_b32 v22, s68, 18
+; VI-NEXT:    v_writelane_b32 v22, s69, 19
+; VI-NEXT:    v_writelane_b32 v22, s70, 20
+; VI-NEXT:    v_writelane_b32 v22, s71, 21
+; VI-NEXT:    v_writelane_b32 v22, s30, 22
 ; VI-NEXT:    v_readfirstlane_b32 s7, v7
 ; VI-NEXT:    v_readfirstlane_b32 s9, v6
 ; VI-NEXT:    v_readfirstlane_b32 s11, v5
@@ -14524,7 +14524,7 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a,
 ; VI-NEXT:    v_readfirstlane_b32 s75, v2
 ; VI-NEXT:    v_readfirstlane_b32 s79, v1
 ; VI-NEXT:    v_readfirstlane_b32 s90, v0
-; VI-NEXT:    v_writelane_b32 v22, s71, 23
+; VI-NEXT:    v_writelane_b32 v22, s31, 23
 ; VI-NEXT:    s_lshr_b32 s14, s29, 16
 ; VI-NEXT:    s_lshr_b32 s73, s28, 16
 ; VI-NEXT:    s_lshr_b32 s76, s27, 16
@@ -14730,6 +14730,7 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a,
 ; VI-NEXT:    s_or_b32 s4, s5, s4
 ; VI-NEXT:    s_add_i32 s57, s4, 0x30000
 ; VI-NEXT:  .LBB31_3: ; %end
+; VI-NEXT:    v_readlane_b32 s30, v22, 22
 ; VI-NEXT:    v_mov_b32_e32 v0, s36
 ; VI-NEXT:    v_mov_b32_e32 v1, s37
 ; VI-NEXT:    v_mov_b32_e32 v2, s38
@@ -14752,30 +14753,29 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a,
 ; VI-NEXT:    v_mov_b32_e32 v19, s55
 ; VI-NEXT:    v_mov_b32_e32 v20, s56
 ; VI-NEXT:    v_mov_b32_e32 v21, s57
-; VI-NEXT:    v_readlane_b32 s71, v22, 23
-; VI-NEXT:    v_readlane_b32 s70, v22, 22
-; VI-NEXT:    v_readlane_b32 s69, v22, 21
-; VI-NEXT:    v_readlane_b32 s68, v22, 20
-; VI-NEXT:    v_readlane_b32 s67, v22, 19
-; VI-NEXT:    v_readlane_b32 s66, v22, 18
-; VI-NEXT:    v_readlane_b32 s65, v22, 17
-; VI-NEXT:    v_readlane_b32 s64, v22, 16
-; VI-NEXT:    v_readlane_b32 s55, v22, 15
-; VI-NEXT:    v_readlane_b32 s54, v22, 14
-; VI-NEXT:    v_readlane_b32 s53, v22, 13
-; VI-NEXT:    v_readlane_b32 s52, v22, 12
-; VI-NEXT:    v_readlane_b32 s51, v22, 11
-; VI-NEXT:    v_readlane_b32 s50, v22, 10
-; VI-NEXT:    v_readlane_b32 s49, v22, 9
-; VI-NEXT:    v_readlane_b32 s48, v22, 8
-; VI-NEXT:    v_readlane_b32 s39, v22, 7
-; VI-NEXT:    v_readlane_b32 s38, v22, 6
-; VI-NEXT:    v_readlane_b32 s37, v22, 5
-; VI-NEXT:    v_readlane_b32 s36, v22, 4
-; VI-NEXT:    v_readlane_b32 s35, v22, 3
-; VI-NEXT:    v_readlane_b32 s34, v22, 2
-; VI-NEXT:    v_readlane_b32 s31, v22, 1
-; VI-NEXT:    v_readlane_b32 s30, v22, 0
+; VI-NEXT:    v_readlane_b32 s31, v22, 23
+; VI-NEXT:    v_readlane_b32 s71, v22, 21
+; VI-NEXT:    v_readlane_b32 s70, v22, 20
+; VI-NEXT:    v_readlane_b32 s69, v22, 19
+; VI-NEXT:    v_readlane_b32 s68, v22, 18
+; VI-NEXT:    v_readlane_b32 s67, v22, 17
+; VI-NEXT:    v_readlane_b32 s66, v22, 16
+; VI-NEXT:    v_readlane_b32 s65, v22, 15
+; VI-NEXT:    v_readlane_b32 s64, v22, 14
+; VI-NEXT:    v_readlane_b32 s55, v22, 13
+; VI-NEXT:    v_readlane_b32 s54, v22, 12
+; VI-NEXT:    v_readlane_b32 s53, v22, 11
+; VI-NEXT:    v_readlane_b32 s52, v22, 10
+; VI-NEXT:    v_readlane_b32 s51, v22, 9
+; VI-NEXT:    v_readlane_b32 s50, v22, 8
+; VI-NEXT:    v_readlane_b32 s49, v22, 7
+; VI-NEXT:    v_readlane_b32 s48, v22, 6
+; VI-NEXT:    v_readlane_b32 s39, v22, 5
+; VI-NEXT:    v_readlane_b32 s38, v22, 4
+; VI-NEXT:    v_readlane_b32 s37, v22, 3
+; VI-NEXT:    v_readlane_b32 s36, v22, 2
+; VI-NEXT:    v_readlane_b32 s35, v22, 1
+; VI-NEXT:    v_readlane_b32 s34, v22, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -15057,7 +15057,7 @@ end:
   ret <22 x float> %phi
 }
 
-define <44 x half> @bitcast_v22f32_to_v44f16(<22 x float> %a, i32 %b) {
+define <44 x half> @bitcast_v22f32_to_v44f16(<22 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v22f32_to_v44f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15661,7 +15661,7 @@ end:
   ret <44 x half> %phi
 }
 
-define inreg <44 x half> @bitcast_v22f32_to_v44f16_scalar(<22 x float> inreg %a, i32 inreg %b) {
+define inreg <44 x half> @bitcast_v22f32_to_v44f16_scalar(<22 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v22f32_to_v44f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16631,7 +16631,7 @@ end:
   ret <44 x half> %phi
 }
 
-define <22 x float> @bitcast_v44f16_to_v22f32(<44 x half> %a, i32 %b) {
+define <22 x float> @bitcast_v44f16_to_v22f32(<44 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v44f16_to_v22f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17678,7 +17678,7 @@ end:
   ret <22 x float> %phi
 }
 
-define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a, i32 inreg %b) {
+define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v44f16_to_v22f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17686,23 +17686,25 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a,
 ; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v32, s30, 0
-; SI-NEXT:    v_writelane_b32 v32, s31, 1
-; SI-NEXT:    v_writelane_b32 v32, s34, 2
-; SI-NEXT:    v_writelane_b32 v32, s35, 3
-; SI-NEXT:    v_writelane_b32 v32, s36, 4
-; SI-NEXT:    v_writelane_b32 v32, s37, 5
-; SI-NEXT:    v_writelane_b32 v32, s38, 6
-; SI-NEXT:    v_writelane_b32 v32, s39, 7
-; SI-NEXT:    v_writelane_b32 v32, s48, 8
-; SI-NEXT:    v_writelane_b32 v32, s49, 9
-; SI-NEXT:    v_writelane_b32 v32, s50, 10
-; SI-NEXT:    v_writelane_b32 v32, s51, 11
-; SI-NEXT:    v_writelane_b32 v32, s52, 12
-; SI-NEXT:    v_writelane_b32 v32, s53, 13
-; SI-NEXT:    v_writelane_b32 v32, s54, 14
-; SI-NEXT:    v_writelane_b32 v32, s55, 15
-; SI-NEXT:    v_writelane_b32 v32, s64, 16
+; SI-NEXT:    v_writelane_b32 v32, s34, 0
+; SI-NEXT:    v_writelane_b32 v32, s35, 1
+; SI-NEXT:    v_writelane_b32 v32, s36, 2
+; SI-NEXT:    v_writelane_b32 v32, s37, 3
+; SI-NEXT:    v_writelane_b32 v32, s38, 4
+; SI-NEXT:    v_writelane_b32 v32, s39, 5
+; SI-NEXT:    v_writelane_b32 v32, s48, 6
+; SI-NEXT:    v_writelane_b32 v32, s49, 7
+; SI-NEXT:    v_writelane_b32 v32, s50, 8
+; SI-NEXT:    v_writelane_b32 v32, s51, 9
+; SI-NEXT:    v_writelane_b32 v32, s52, 10
+; SI-NEXT:    v_writelane_b32 v32, s53, 11
+; SI-NEXT:    v_writelane_b32 v32, s54, 12
+; SI-NEXT:    v_writelane_b32 v32, s55, 13
+; SI-NEXT:    v_writelane_b32 v32, s64, 14
+; SI-NEXT:    v_writelane_b32 v32, s65, 15
+; SI-NEXT:    v_writelane_b32 v32, s66, 16
+; SI-NEXT:    v_writelane_b32 v32, s67, 17
+; SI-NEXT:    v_writelane_b32 v32, s30, 18
 ; SI-NEXT:    v_readfirstlane_b32 s6, v7
 ; SI-NEXT:    v_readfirstlane_b32 s8, v6
 ; SI-NEXT:    v_readfirstlane_b32 s10, v5
@@ -17711,7 +17713,7 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a,
 ; SI-NEXT:    v_readfirstlane_b32 s72, v2
 ; SI-NEXT:    v_readfirstlane_b32 s74, v1
 ; SI-NEXT:    v_readfirstlane_b32 s77, v0
-; SI-NEXT:    v_writelane_b32 v32, s65, 17
+; SI-NEXT:    v_writelane_b32 v32, s31, 19
 ; SI-NEXT:    s_lshr_b32 s75, s29, 16
 ; SI-NEXT:    s_lshr_b32 s78, s28, 16
 ; SI-NEXT:    s_lshr_b32 s88, s27, 16
@@ -17735,9 +17737,7 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a,
 ; SI-NEXT:    s_lshr_b32 s76, s74, 16
 ; SI-NEXT:    s_lshr_b32 s79, s77, 16
 ; SI-NEXT:    v_readfirstlane_b32 s4, v8
-; SI-NEXT:    v_writelane_b32 v32, s66, 18
 ; SI-NEXT:    s_cmp_lg_u32 s4, 0
-; SI-NEXT:    v_writelane_b32 v32, s67, 19
 ; SI-NEXT:    s_cbranch_scc0 .LBB35_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_and_b32 s4, s16, 0xffff
@@ -18022,26 +18022,26 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a,
 ; SI-NEXT:    v_mov_b32_e32 v30, s66
 ; SI-NEXT:    v_mov_b32_e32 v31, s67
 ; SI-NEXT:  .LBB35_5: ; %end
-; SI-NEXT:    v_readlane_b32 s67, v32, 19
-; SI-NEXT:    v_readlane_b32 s66, v32, 18
-; SI-NEXT:    v_readlane_b32 s65, v32, 17
-; SI-NEXT:    v_readlane_b32 s64, v32, 16
-; SI-NEXT:    v_readlane_b32 s55, v32, 15
-; SI-NEXT:    v_readlane_b32 s54, v32, 14
-; SI-NEXT:    v_readlane_b32 s53, v32, 13
-; SI-NEXT:    v_readlane_b32 s52, v32, 12
-; SI-NEXT:    v_readlane_b32 s51, v32, 11
-; SI-NEXT:    v_readlane_b32 s50, v32, 10
-; SI-NEXT:    v_readlane_b32 s49, v32, 9
-; SI-NEXT:    v_readlane_b32 s48, v32, 8
-; SI-NEXT:    v_readlane_b32 s39, v32, 7
-; SI-NEXT:    v_readlane_b32 s38, v32, 6
-; SI-NEXT:    v_readlane_b32 s37, v32, 5
-; SI-NEXT:    v_readlane_b32 s36, v32, 4
-; SI-NEXT:    v_readlane_b32 s35, v32, 3
-; SI-NEXT:    v_readlane_b32 s34, v32, 2
-; SI-NEXT:    v_readlane_b32 s31, v32, 1
-; SI-NEXT:    v_readlane_b32 s30, v32, 0
+; SI-NEXT:    v_readlane_b32 s30, v32, 18
+; SI-NEXT:    v_readlane_b32 s31, v32, 19
+; SI-NEXT:    v_readlane_b32 s67, v32, 17
+; SI-NEXT:    v_readlane_b32 s66, v32, 16
+; SI-NEXT:    v_readlane_b32 s65, v32, 15
+; SI-NEXT:    v_readlane_b32 s64, v32, 14
+; SI-NEXT:    v_readlane_b32 s55, v32, 13
+; SI-NEXT:    v_readlane_b32 s54, v32, 12
+; SI-NEXT:    v_readlane_b32 s53, v32, 11
+; SI-NEXT:    v_readlane_b32 s52, v32, 10
+; SI-NEXT:    v_readlane_b32 s51, v32, 9
+; SI-NEXT:    v_readlane_b32 s50, v32, 8
+; SI-NEXT:    v_readlane_b32 s49, v32, 7
+; SI-NEXT:    v_readlane_b32 s48, v32, 6
+; SI-NEXT:    v_readlane_b32 s39, v32, 5
+; SI-NEXT:    v_readlane_b32 s38, v32, 4
+; SI-NEXT:    v_readlane_b32 s37, v32, 3
+; SI-NEXT:    v_readlane_b32 s36, v32, 2
+; SI-NEXT:    v_readlane_b32 s35, v32, 1
+; SI-NEXT:    v_readlane_b32 s34, v32, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -18054,29 +18054,29 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a,
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v32, s30, 0
-; VI-NEXT:    v_writelane_b32 v32, s31, 1
-; VI-NEXT:    v_writelane_b32 v32, s34, 2
-; VI-NEXT:    v_writelane_b32 v32, s35, 3
-; VI-NEXT:    v_writelane_b32 v32, s36, 4
-; VI-NEXT:    v_writelane_b32 v32, s37, 5
-; VI-NEXT:    v_writelane_b32 v32, s38, 6
-; VI-NEXT:    v_writelane_b32 v32, s39, 7
-; VI-NEXT:    v_writelane_b32 v32, s48, 8
-; VI-NEXT:    v_writelane_b32 v32, s49, 9
-; VI-NEXT:    v_writelane_b32 v32, s50, 10
-; VI-NEXT:    v_writelane_b32 v32, s51, 11
-; VI-NEXT:    v_writelane_b32 v32, s52, 12
-; VI-NEXT:    v_writelane_b32 v32, s53, 13
-; VI-NEXT:    v_writelane_b32 v32, s54, 14
-; VI-NEXT:    v_writelane_b32 v32, s55, 15
-; VI-NEXT:    v_writelane_b32 v32, s64, 16
-; VI-NEXT:    v_writelane_b32 v32, s65, 17
-; VI-NEXT:    v_writelane_b32 v32, s66, 18
-; VI-NEXT:    v_writelane_b32 v32, s67, 19
-; VI-NEXT:    v_writelane_b32 v32, s68, 20
-; VI-NEXT:    v_writelane_b32 v32, s69, 21
-; VI-NEXT:    v_writelane_b32 v32, s70, 22
+; VI-NEXT:    v_writelane_b32 v32, s34, 0
+; VI-NEXT:    v_writelane_b32 v32, s35, 1
+; VI-NEXT:    v_writelane_b32 v32, s36, 2
+; VI-NEXT:    v_writelane_b32 v32, s37, 3
+; VI-NEXT:    v_writelane_b32 v32, s38, 4
+; VI-NEXT:    v_writelane_b32 v32, s39, 5
+; VI-NEXT:    v_writelane_b32 v32, s48, 6
+; VI-NEXT:    v_writelane_b32 v32, s49, 7
+; VI-NEXT:    v_writelane_b32 v32, s50, 8
+; VI-NEXT:    v_writelane_b32 v32, s51, 9
+; VI-NEXT:    v_writelane_b32 v32, s52, 10
+; VI-NEXT:    v_writelane_b32 v32, s53, 11
+; VI-NEXT:    v_writelane_b32 v32, s54, 12
+; VI-NEXT:    v_writelane_b32 v32, s55, 13
+; VI-NEXT:    v_writelane_b32 v32, s64, 14
+; VI-NEXT:    v_writelane_b32 v32, s65, 15
+; VI-NEXT:    v_writelane_b32 v32, s66, 16
+; VI-NEXT:    v_writelane_b32 v32, s67, 17
+; VI-NEXT:    v_writelane_b32 v32, s68, 18
+; VI-NEXT:    v_writelane_b32 v32, s69, 19
+; VI-NEXT:    v_writelane_b32 v32, s70, 20
+; VI-NEXT:    v_writelane_b32 v32, s71, 21
+; VI-NEXT:    v_writelane_b32 v32, s30, 22
 ; VI-NEXT:    v_readfirstlane_b32 s6, v7
 ; VI-NEXT:    v_readfirstlane_b32 s8, v6
 ; VI-NEXT:    v_readfirstlane_b32 s10, v5
@@ -18085,7 +18085,7 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a,
 ; VI-NEXT:    v_readfirstlane_b32 s75, v2
 ; VI-NEXT:    v_readfirstlane_b32 s78, v1
 ; VI-NEXT:    v_readfirstlane_b32 s89, v0
-; VI-NEXT:    v_writelane_b32 v32, s71, 23
+; VI-NEXT:    v_writelane_b32 v32, s31, 23
 ; VI-NEXT:    s_lshr_b32 s13, s29, 16
 ; VI-NEXT:    s_lshr_b32 s72, s28, 16
 ; VI-NEXT:    s_lshr_b32 s74, s27, 16
@@ -18307,30 +18307,30 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a,
 ; VI-NEXT:    v_mov_b32_e32 v30, s66
 ; VI-NEXT:    v_mov_b32_e32 v31, s67
 ; VI-NEXT:  .LBB35_5: ; %end
-; VI-NEXT:    v_readlane_b32 s71, v32, 23
-; VI-NEXT:    v_readlane_b32 s70, v32, 22
-; VI-NEXT:    v_readlane_b32 s69, v32, 21
-; VI-NEXT:    v_readlane_b32 s68, v32, 20
-; VI-NEXT:    v_readlane_b32 s67, v32, 19
-; VI-NEXT:    v_readlane_b32 s66, v32, 18
-; VI-NEXT:    v_readlane_b32 s65, v32, 17
-; VI-NEXT:    v_readlane_b32 s64, v32, 16
-; VI-NEXT:    v_readlane_b32 s55, v32, 15
-; VI-NEXT:    v_readlane_b32 s54, v32, 14
-; VI-NEXT:    v_readlane_b32 s53, v32, 13
-; VI-NEXT:    v_readlane_b32 s52, v32, 12
-; VI-NEXT:    v_readlane_b32 s51, v32, 11
-; VI-NEXT:    v_readlane_b32 s50, v32, 10
-; VI-NEXT:    v_readlane_b32 s49, v32, 9
-; VI-NEXT:    v_readlane_b32 s48, v32, 8
-; VI-NEXT:    v_readlane_b32 s39, v32, 7
-; VI-NEXT:    v_readlane_b32 s38, v32, 6
-; VI-NEXT:    v_readlane_b32 s37, v32, 5
-; VI-NEXT:    v_readlane_b32 s36, v32, 4
-; VI-NEXT:    v_readlane_b32 s35, v32, 3
-; VI-NEXT:    v_readlane_b32 s34, v32, 2
-; VI-NEXT:    v_readlane_b32 s31, v32, 1
-; VI-NEXT:    v_readlane_b32 s30, v32, 0
+; VI-NEXT:    v_readlane_b32 s30, v32, 22
+; VI-NEXT:    v_readlane_b32 s31, v32, 23
+; VI-NEXT:    v_readlane_b32 s71, v32, 21
+; VI-NEXT:    v_readlane_b32 s70, v32, 20
+; VI-NEXT:    v_readlane_b32 s69, v32, 19
+; VI-NEXT:    v_readlane_b32 s68, v32, 18
+; VI-NEXT:    v_readlane_b32 s67, v32, 17
+; VI-NEXT:    v_readlane_b32 s66, v32, 16
+; VI-NEXT:    v_readlane_b32 s65, v32, 15
+; VI-NEXT:    v_readlane_b32 s64, v32, 14
+; VI-NEXT:    v_readlane_b32 s55, v32, 13
+; VI-NEXT:    v_readlane_b32 s54, v32, 12
+; VI-NEXT:    v_readlane_b32 s53, v32, 11
+; VI-NEXT:    v_readlane_b32 s52, v32, 10
+; VI-NEXT:    v_readlane_b32 s51, v32, 9
+; VI-NEXT:    v_readlane_b32 s50, v32, 8
+; VI-NEXT:    v_readlane_b32 s49, v32, 7
+; VI-NEXT:    v_readlane_b32 s48, v32, 6
+; VI-NEXT:    v_readlane_b32 s39, v32, 5
+; VI-NEXT:    v_readlane_b32 s38, v32, 4
+; VI-NEXT:    v_readlane_b32 s37, v32, 3
+; VI-NEXT:    v_readlane_b32 s36, v32, 2
+; VI-NEXT:    v_readlane_b32 s35, v32, 1
+; VI-NEXT:    v_readlane_b32 s34, v32, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -18610,7 +18610,7 @@ end:
   ret <22 x float> %phi
 }
 
-define <11 x double> @bitcast_v11i64_to_v11f64(<11 x i64> %a, i32 %b) {
+define <11 x double> @bitcast_v11i64_to_v11f64(<11 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v11i64_to_v11f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18774,7 +18774,7 @@ end:
   ret <11 x double> %phi
 }
 
-define inreg <11 x double> @bitcast_v11i64_to_v11f64_scalar(<11 x i64> inreg %a, i32 inreg %b) {
+define inreg <11 x double> @bitcast_v11i64_to_v11f64_scalar(<11 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v11i64_to_v11f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19042,7 +19042,7 @@ end:
   ret <11 x double> %phi
 }
 
-define <11 x i64> @bitcast_v11f64_to_v11i64(<11 x double> %a, i32 %b) {
+define <11 x i64> @bitcast_v11f64_to_v11i64(<11 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v11f64_to_v11i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19156,7 +19156,7 @@ end:
   ret <11 x i64> %phi
 }
 
-define inreg <11 x i64> @bitcast_v11f64_to_v11i64_scalar(<11 x double> inreg %a, i32 inreg %b) {
+define inreg <11 x i64> @bitcast_v11f64_to_v11i64_scalar(<11 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v11f64_to_v11i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19608,7 +19608,7 @@ end:
   ret <11 x i64> %phi
 }
 
-define <44 x i16> @bitcast_v11i64_to_v44i16(<11 x i64> %a, i32 %b) {
+define <44 x i16> @bitcast_v11i64_to_v44i16(<11 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v11i64_to_v44i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20246,7 +20246,7 @@ end:
   ret <44 x i16> %phi
 }
 
-define inreg <44 x i16> @bitcast_v11i64_to_v44i16_scalar(<11 x i64> inreg %a, i32 inreg %b) {
+define inreg <44 x i16> @bitcast_v11i64_to_v44i16_scalar(<11 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v11i64_to_v44i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20955,7 +20955,7 @@ end:
   ret <44 x i16> %phi
 }
 
-define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) {
+define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v44i16_to_v11i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -21903,7 +21903,7 @@ end:
   ret <11 x i64> %phi
 }
 
-define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i32 inreg %b) {
+define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v44i16_to_v11i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -21911,23 +21911,25 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3
 ; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v22, s30, 0
-; SI-NEXT:    v_writelane_b32 v22, s31, 1
-; SI-NEXT:    v_writelane_b32 v22, s34, 2
-; SI-NEXT:    v_writelane_b32 v22, s35, 3
-; SI-NEXT:    v_writelane_b32 v22, s36, 4
-; SI-NEXT:    v_writelane_b32 v22, s37, 5
-; SI-NEXT:    v_writelane_b32 v22, s38, 6
-; SI-NEXT:    v_writelane_b32 v22, s39, 7
-; SI-NEXT:    v_writelane_b32 v22, s48, 8
-; SI-NEXT:    v_writelane_b32 v22, s49, 9
-; SI-NEXT:    v_writelane_b32 v22, s50, 10
-; SI-NEXT:    v_writelane_b32 v22, s51, 11
-; SI-NEXT:    v_writelane_b32 v22, s52, 12
-; SI-NEXT:    v_writelane_b32 v22, s53, 13
-; SI-NEXT:    v_writelane_b32 v22, s54, 14
-; SI-NEXT:    v_writelane_b32 v22, s55, 15
-; SI-NEXT:    v_writelane_b32 v22, s64, 16
+; SI-NEXT:    v_writelane_b32 v22, s34, 0
+; SI-NEXT:    v_writelane_b32 v22, s35, 1
+; SI-NEXT:    v_writelane_b32 v22, s36, 2
+; SI-NEXT:    v_writelane_b32 v22, s37, 3
+; SI-NEXT:    v_writelane_b32 v22, s38, 4
+; SI-NEXT:    v_writelane_b32 v22, s39, 5
+; SI-NEXT:    v_writelane_b32 v22, s48, 6
+; SI-NEXT:    v_writelane_b32 v22, s49, 7
+; SI-NEXT:    v_writelane_b32 v22, s50, 8
+; SI-NEXT:    v_writelane_b32 v22, s51, 9
+; SI-NEXT:    v_writelane_b32 v22, s52, 10
+; SI-NEXT:    v_writelane_b32 v22, s53, 11
+; SI-NEXT:    v_writelane_b32 v22, s54, 12
+; SI-NEXT:    v_writelane_b32 v22, s55, 13
+; SI-NEXT:    v_writelane_b32 v22, s64, 14
+; SI-NEXT:    v_writelane_b32 v22, s65, 15
+; SI-NEXT:    v_writelane_b32 v22, s66, 16
+; SI-NEXT:    v_writelane_b32 v22, s67, 17
+; SI-NEXT:    v_writelane_b32 v22, s30, 18
 ; SI-NEXT:    v_readfirstlane_b32 s7, v7
 ; SI-NEXT:    v_readfirstlane_b32 s9, v6
 ; SI-NEXT:    v_readfirstlane_b32 s11, v5
@@ -21936,7 +21938,7 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3
 ; SI-NEXT:    v_readfirstlane_b32 s75, v2
 ; SI-NEXT:    v_readfirstlane_b32 s79, v1
 ; SI-NEXT:    v_readfirstlane_b32 s90, v0
-; SI-NEXT:    v_writelane_b32 v22, s65, 17
+; SI-NEXT:    v_writelane_b32 v22, s31, 19
 ; SI-NEXT:    s_lshr_b32 s14, s29, 16
 ; SI-NEXT:    s_lshr_b32 s73, s28, 16
 ; SI-NEXT:    s_lshr_b32 s76, s27, 16
@@ -21960,9 +21962,7 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3
 ; SI-NEXT:    s_lshr_b32 s77, s79, 16
 ; SI-NEXT:    s_lshr_b32 s89, s90, 16
 ; SI-NEXT:    v_readfirstlane_b32 s4, v8
-; SI-NEXT:    v_writelane_b32 v22, s66, 18
 ; SI-NEXT:    s_cmp_lg_u32 s4, 0
-; SI-NEXT:    v_writelane_b32 v22, s67, 19
 ; SI-NEXT:    s_cbranch_scc0 .LBB43_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_and_b32 s4, s16, 0xffff
@@ -22144,6 +22144,7 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_add_i32 s57, s4, 0x30000
 ; SI-NEXT:  .LBB43_3: ; %end
+; SI-NEXT:    v_readlane_b32 s30, v22, 18
 ; SI-NEXT:    v_mov_b32_e32 v0, s36
 ; SI-NEXT:    v_mov_b32_e32 v1, s37
 ; SI-NEXT:    v_mov_b32_e32 v2, s38
@@ -22166,26 +22167,25 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3
 ; SI-NEXT:    v_mov_b32_e32 v19, s55
 ; SI-NEXT:    v_mov_b32_e32 v20, s56
 ; SI-NEXT:    v_mov_b32_e32 v21, s57
-; SI-NEXT:    v_readlane_b32 s67, v22, 19
-; SI-NEXT:    v_readlane_b32 s66, v22, 18
-; SI-NEXT:    v_readlane_b32 s65, v22, 17
-; SI-NEXT:    v_readlane_b32 s64, v22, 16
-; SI-NEXT:    v_readlane_b32 s55, v22, 15
-; SI-NEXT:    v_readlane_b32 s54, v22, 14
-; SI-NEXT:    v_readlane_b32 s53, v22, 13
-; SI-NEXT:    v_readlane_b32 s52, v22, 12
-; SI-NEXT:    v_readlane_b32 s51, v22, 11
-; SI-NEXT:    v_readlane_b32 s50, v22, 10
-; SI-NEXT:    v_readlane_b32 s49, v22, 9
-; SI-NEXT:    v_readlane_b32 s48, v22, 8
-; SI-NEXT:    v_readlane_b32 s39, v22, 7
-; SI-NEXT:    v_readlane_b32 s38, v22, 6
-; SI-NEXT:    v_readlane_b32 s37, v22, 5
-; SI-NEXT:    v_readlane_b32 s36, v22, 4
-; SI-NEXT:    v_readlane_b32 s35, v22, 3
-; SI-NEXT:    v_readlane_b32 s34, v22, 2
-; SI-NEXT:    v_readlane_b32 s31, v22, 1
-; SI-NEXT:    v_readlane_b32 s30, v22, 0
+; SI-NEXT:    v_readlane_b32 s31, v22, 19
+; SI-NEXT:    v_readlane_b32 s67, v22, 17
+; SI-NEXT:    v_readlane_b32 s66, v22, 16
+; SI-NEXT:    v_readlane_b32 s65, v22, 15
+; SI-NEXT:    v_readlane_b32 s64, v22, 14
+; SI-NEXT:    v_readlane_b32 s55, v22, 13
+; SI-NEXT:    v_readlane_b32 s54, v22, 12
+; SI-NEXT:    v_readlane_b32 s53, v22, 11
+; SI-NEXT:    v_readlane_b32 s52, v22, 10
+; SI-NEXT:    v_readlane_b32 s51, v22, 9
+; SI-NEXT:    v_readlane_b32 s50, v22, 8
+; SI-NEXT:    v_readlane_b32 s49, v22, 7
+; SI-NEXT:    v_readlane_b32 s48, v22, 6
+; SI-NEXT:    v_readlane_b32 s39, v22, 5
+; SI-NEXT:    v_readlane_b32 s38, v22, 4
+; SI-NEXT:    v_readlane_b32 s37, v22, 3
+; SI-NEXT:    v_readlane_b32 s36, v22, 2
+; SI-NEXT:    v_readlane_b32 s35, v22, 1
+; SI-NEXT:    v_readlane_b32 s34, v22, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -22201,29 +22201,29 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v22, s30, 0
-; VI-NEXT:    v_writelane_b32 v22, s31, 1
-; VI-NEXT:    v_writelane_b32 v22, s34, 2
-; VI-NEXT:    v_writelane_b32 v22, s35, 3
-; VI-NEXT:    v_writelane_b32 v22, s36, 4
-; VI-NEXT:    v_writelane_b32 v22, s37, 5
-; VI-NEXT:    v_writelane_b32 v22, s38, 6
-; VI-NEXT:    v_writelane_b32 v22, s39, 7
-; VI-NEXT:    v_writelane_b32 v22, s48, 8
-; VI-NEXT:    v_writelane_b32 v22, s49, 9
-; VI-NEXT:    v_writelane_b32 v22, s50, 10
-; VI-NEXT:    v_writelane_b32 v22, s51, 11
-; VI-NEXT:    v_writelane_b32 v22, s52, 12
-; VI-NEXT:    v_writelane_b32 v22, s53, 13
-; VI-NEXT:    v_writelane_b32 v22, s54, 14
-; VI-NEXT:    v_writelane_b32 v22, s55, 15
-; VI-NEXT:    v_writelane_b32 v22, s64, 16
-; VI-NEXT:    v_writelane_b32 v22, s65, 17
-; VI-NEXT:    v_writelane_b32 v22, s66, 18
-; VI-NEXT:    v_writelane_b32 v22, s67, 19
-; VI-NEXT:    v_writelane_b32 v22, s68, 20
-; VI-NEXT:    v_writelane_b32 v22, s69, 21
-; VI-NEXT:    v_writelane_b32 v22, s70, 22
+; VI-NEXT:    v_writelane_b32 v22, s34, 0
+; VI-NEXT:    v_writelane_b32 v22, s35, 1
+; VI-NEXT:    v_writelane_b32 v22, s36, 2
+; VI-NEXT:    v_writelane_b32 v22, s37, 3
+; VI-NEXT:    v_writelane_b32 v22, s38, 4
+; VI-NEXT:    v_writelane_b32 v22, s39, 5
+; VI-NEXT:    v_writelane_b32 v22, s48, 6
+; VI-NEXT:    v_writelane_b32 v22, s49, 7
+; VI-NEXT:    v_writelane_b32 v22, s50, 8
+; VI-NEXT:    v_writelane_b32 v22, s51, 9
+; VI-NEXT:    v_writelane_b32 v22, s52, 10
+; VI-NEXT:    v_writelane_b32 v22, s53, 11
+; VI-NEXT:    v_writelane_b32 v22, s54, 12
+; VI-NEXT:    v_writelane_b32 v22, s55, 13
+; VI-NEXT:    v_writelane_b32 v22, s64, 14
+; VI-NEXT:    v_writelane_b32 v22, s65, 15
+; VI-NEXT:    v_writelane_b32 v22, s66, 16
+; VI-NEXT:    v_writelane_b32 v22, s67, 17
+; VI-NEXT:    v_writelane_b32 v22, s68, 18
+; VI-NEXT:    v_writelane_b32 v22, s69, 19
+; VI-NEXT:    v_writelane_b32 v22, s70, 20
+; VI-NEXT:    v_writelane_b32 v22, s71, 21
+; VI-NEXT:    v_writelane_b32 v22, s30, 22
 ; VI-NEXT:    v_readfirstlane_b32 s7, v7
 ; VI-NEXT:    v_readfirstlane_b32 s9, v6
 ; VI-NEXT:    v_readfirstlane_b32 s11, v5
@@ -22232,7 +22232,7 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3
 ; VI-NEXT:    v_readfirstlane_b32 s75, v2
 ; VI-NEXT:    v_readfirstlane_b32 s79, v1
 ; VI-NEXT:    v_readfirstlane_b32 s90, v0
-; VI-NEXT:    v_writelane_b32 v22, s71, 23
+; VI-NEXT:    v_writelane_b32 v22, s31, 23
 ; VI-NEXT:    s_lshr_b32 s14, s29, 16
 ; VI-NEXT:    s_lshr_b32 s73, s28, 16
 ; VI-NEXT:    s_lshr_b32 s76, s27, 16
@@ -22438,6 +22438,7 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3
 ; VI-NEXT:    s_or_b32 s4, s5, s4
 ; VI-NEXT:    s_add_i32 s57, s4, 0x30000
 ; VI-NEXT:  .LBB43_3: ; %end
+; VI-NEXT:    v_readlane_b32 s30, v22, 22
 ; VI-NEXT:    v_mov_b32_e32 v0, s36
 ; VI-NEXT:    v_mov_b32_e32 v1, s37
 ; VI-NEXT:    v_mov_b32_e32 v2, s38
@@ -22460,30 +22461,29 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3
 ; VI-NEXT:    v_mov_b32_e32 v19, s55
 ; VI-NEXT:    v_mov_b32_e32 v20, s56
 ; VI-NEXT:    v_mov_b32_e32 v21, s57
-; VI-NEXT:    v_readlane_b32 s71, v22, 23
-; VI-NEXT:    v_readlane_b32 s70, v22, 22
-; VI-NEXT:    v_readlane_b32 s69, v22, 21
-; VI-NEXT:    v_readlane_b32 s68, v22, 20
-; VI-NEXT:    v_readlane_b32 s67, v22, 19
-; VI-NEXT:    v_readlane_b32 s66, v22, 18
-; VI-NEXT:    v_readlane_b32 s65, v22, 17
-; VI-NEXT:    v_readlane_b32 s64, v22, 16
-; VI-NEXT:    v_readlane_b32 s55, v22, 15
-; VI-NEXT:    v_readlane_b32 s54, v22, 14
-; VI-NEXT:    v_readlane_b32 s53, v22, 13
-; VI-NEXT:    v_readlane_b32 s52, v22, 12
-; VI-NEXT:    v_readlane_b32 s51, v22, 11
-; VI-NEXT:    v_readlane_b32 s50, v22, 10
-; VI-NEXT:    v_readlane_b32 s49, v22, 9
-; VI-NEXT:    v_readlane_b32 s48, v22, 8
-; VI-NEXT:    v_readlane_b32 s39, v22, 7
-; VI-NEXT:    v_readlane_b32 s38, v22, 6
-; VI-NEXT:    v_readlane_b32 s37, v22, 5
-; VI-NEXT:    v_readlane_b32 s36, v22, 4
-; VI-NEXT:    v_readlane_b32 s35, v22, 3
-; VI-NEXT:    v_readlane_b32 s34, v22, 2
-; VI-NEXT:    v_readlane_b32 s31, v22, 1
-; VI-NEXT:    v_readlane_b32 s30, v22, 0
+; VI-NEXT:    v_readlane_b32 s31, v22, 23
+; VI-NEXT:    v_readlane_b32 s71, v22, 21
+; VI-NEXT:    v_readlane_b32 s70, v22, 20
+; VI-NEXT:    v_readlane_b32 s69, v22, 19
+; VI-NEXT:    v_readlane_b32 s68, v22, 18
+; VI-NEXT:    v_readlane_b32 s67, v22, 17
+; VI-NEXT:    v_readlane_b32 s66, v22, 16
+; VI-NEXT:    v_readlane_b32 s65, v22, 15
+; VI-NEXT:    v_readlane_b32 s64, v22, 14
+; VI-NEXT:    v_readlane_b32 s55, v22, 13
+; VI-NEXT:    v_readlane_b32 s54, v22, 12
+; VI-NEXT:    v_readlane_b32 s53, v22, 11
+; VI-NEXT:    v_readlane_b32 s52, v22, 10
+; VI-NEXT:    v_readlane_b32 s51, v22, 9
+; VI-NEXT:    v_readlane_b32 s50, v22, 8
+; VI-NEXT:    v_readlane_b32 s49, v22, 7
+; VI-NEXT:    v_readlane_b32 s48, v22, 6
+; VI-NEXT:    v_readlane_b32 s39, v22, 5
+; VI-NEXT:    v_readlane_b32 s38, v22, 4
+; VI-NEXT:    v_readlane_b32 s37, v22, 3
+; VI-NEXT:    v_readlane_b32 s36, v22, 2
+; VI-NEXT:    v_readlane_b32 s35, v22, 1
+; VI-NEXT:    v_readlane_b32 s34, v22, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -22765,7 +22765,7 @@ end:
   ret <11 x i64> %phi
 }
 
-define <44 x half> @bitcast_v11i64_to_v44f16(<11 x i64> %a, i32 %b) {
+define <44 x half> @bitcast_v11i64_to_v44f16(<11 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v11i64_to_v44f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -23403,7 +23403,7 @@ end:
   ret <44 x half> %phi
 }
 
-define inreg <44 x half> @bitcast_v11i64_to_v44f16_scalar(<11 x i64> inreg %a, i32 inreg %b) {
+define inreg <44 x half> @bitcast_v11i64_to_v44f16_scalar(<11 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v11i64_to_v44f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -24112,7 +24112,7 @@ end:
   ret <44 x half> %phi
 }
 
-define <11 x i64> @bitcast_v44f16_to_v11i64(<44 x half> %a, i32 %b) {
+define <11 x i64> @bitcast_v44f16_to_v11i64(<44 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v44f16_to_v11i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -25159,7 +25159,7 @@ end:
   ret <11 x i64> %phi
 }
 
-define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i32 inreg %b) {
+define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v44f16_to_v11i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -25167,23 +25167,25 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v32, s30, 0
-; SI-NEXT:    v_writelane_b32 v32, s31, 1
-; SI-NEXT:    v_writelane_b32 v32, s34, 2
-; SI-NEXT:    v_writelane_b32 v32, s35, 3
-; SI-NEXT:    v_writelane_b32 v32, s36, 4
-; SI-NEXT:    v_writelane_b32 v32, s37, 5
-; SI-NEXT:    v_writelane_b32 v32, s38, 6
-; SI-NEXT:    v_writelane_b32 v32, s39, 7
-; SI-NEXT:    v_writelane_b32 v32, s48, 8
-; SI-NEXT:    v_writelane_b32 v32, s49, 9
-; SI-NEXT:    v_writelane_b32 v32, s50, 10
-; SI-NEXT:    v_writelane_b32 v32, s51, 11
-; SI-NEXT:    v_writelane_b32 v32, s52, 12
-; SI-NEXT:    v_writelane_b32 v32, s53, 13
-; SI-NEXT:    v_writelane_b32 v32, s54, 14
-; SI-NEXT:    v_writelane_b32 v32, s55, 15
-; SI-NEXT:    v_writelane_b32 v32, s64, 16
+; SI-NEXT:    v_writelane_b32 v32, s34, 0
+; SI-NEXT:    v_writelane_b32 v32, s35, 1
+; SI-NEXT:    v_writelane_b32 v32, s36, 2
+; SI-NEXT:    v_writelane_b32 v32, s37, 3
+; SI-NEXT:    v_writelane_b32 v32, s38, 4
+; SI-NEXT:    v_writelane_b32 v32, s39, 5
+; SI-NEXT:    v_writelane_b32 v32, s48, 6
+; SI-NEXT:    v_writelane_b32 v32, s49, 7
+; SI-NEXT:    v_writelane_b32 v32, s50, 8
+; SI-NEXT:    v_writelane_b32 v32, s51, 9
+; SI-NEXT:    v_writelane_b32 v32, s52, 10
+; SI-NEXT:    v_writelane_b32 v32, s53, 11
+; SI-NEXT:    v_writelane_b32 v32, s54, 12
+; SI-NEXT:    v_writelane_b32 v32, s55, 13
+; SI-NEXT:    v_writelane_b32 v32, s64, 14
+; SI-NEXT:    v_writelane_b32 v32, s65, 15
+; SI-NEXT:    v_writelane_b32 v32, s66, 16
+; SI-NEXT:    v_writelane_b32 v32, s67, 17
+; SI-NEXT:    v_writelane_b32 v32, s30, 18
 ; SI-NEXT:    v_readfirstlane_b32 s6, v7
 ; SI-NEXT:    v_readfirstlane_b32 s8, v6
 ; SI-NEXT:    v_readfirstlane_b32 s10, v5
@@ -25192,7 +25194,7 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i
 ; SI-NEXT:    v_readfirstlane_b32 s72, v2
 ; SI-NEXT:    v_readfirstlane_b32 s74, v1
 ; SI-NEXT:    v_readfirstlane_b32 s77, v0
-; SI-NEXT:    v_writelane_b32 v32, s65, 17
+; SI-NEXT:    v_writelane_b32 v32, s31, 19
 ; SI-NEXT:    s_lshr_b32 s75, s29, 16
 ; SI-NEXT:    s_lshr_b32 s78, s28, 16
 ; SI-NEXT:    s_lshr_b32 s88, s27, 16
@@ -25216,9 +25218,7 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i
 ; SI-NEXT:    s_lshr_b32 s76, s74, 16
 ; SI-NEXT:    s_lshr_b32 s79, s77, 16
 ; SI-NEXT:    v_readfirstlane_b32 s4, v8
-; SI-NEXT:    v_writelane_b32 v32, s66, 18
 ; SI-NEXT:    s_cmp_lg_u32 s4, 0
-; SI-NEXT:    v_writelane_b32 v32, s67, 19
 ; SI-NEXT:    s_cbranch_scc0 .LBB47_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_and_b32 s4, s16, 0xffff
@@ -25503,26 +25503,26 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v30, s66
 ; SI-NEXT:    v_mov_b32_e32 v31, s67
 ; SI-NEXT:  .LBB47_5: ; %end
-; SI-NEXT:    v_readlane_b32 s67, v32, 19
-; SI-NEXT:    v_readlane_b32 s66, v32, 18
-; SI-NEXT:    v_readlane_b32 s65, v32, 17
-; SI-NEXT:    v_readlane_b32 s64, v32, 16
-; SI-NEXT:    v_readlane_b32 s55, v32, 15
-; SI-NEXT:    v_readlane_b32 s54, v32, 14
-; SI-NEXT:    v_readlane_b32 s53, v32, 13
-; SI-NEXT:    v_readlane_b32 s52, v32, 12
-; SI-NEXT:    v_readlane_b32 s51, v32, 11
-; SI-NEXT:    v_readlane_b32 s50, v32, 10
-; SI-NEXT:    v_readlane_b32 s49, v32, 9
-; SI-NEXT:    v_readlane_b32 s48, v32, 8
-; SI-NEXT:    v_readlane_b32 s39, v32, 7
-; SI-NEXT:    v_readlane_b32 s38, v32, 6
-; SI-NEXT:    v_readlane_b32 s37, v32, 5
-; SI-NEXT:    v_readlane_b32 s36, v32, 4
-; SI-NEXT:    v_readlane_b32 s35, v32, 3
-; SI-NEXT:    v_readlane_b32 s34, v32, 2
-; SI-NEXT:    v_readlane_b32 s31, v32, 1
-; SI-NEXT:    v_readlane_b32 s30, v32, 0
+; SI-NEXT:    v_readlane_b32 s30, v32, 18
+; SI-NEXT:    v_readlane_b32 s31, v32, 19
+; SI-NEXT:    v_readlane_b32 s67, v32, 17
+; SI-NEXT:    v_readlane_b32 s66, v32, 16
+; SI-NEXT:    v_readlane_b32 s65, v32, 15
+; SI-NEXT:    v_readlane_b32 s64, v32, 14
+; SI-NEXT:    v_readlane_b32 s55, v32, 13
+; SI-NEXT:    v_readlane_b32 s54, v32, 12
+; SI-NEXT:    v_readlane_b32 s53, v32, 11
+; SI-NEXT:    v_readlane_b32 s52, v32, 10
+; SI-NEXT:    v_readlane_b32 s51, v32, 9
+; SI-NEXT:    v_readlane_b32 s50, v32, 8
+; SI-NEXT:    v_readlane_b32 s49, v32, 7
+; SI-NEXT:    v_readlane_b32 s48, v32, 6
+; SI-NEXT:    v_readlane_b32 s39, v32, 5
+; SI-NEXT:    v_readlane_b32 s38, v32, 4
+; SI-NEXT:    v_readlane_b32 s37, v32, 3
+; SI-NEXT:    v_readlane_b32 s36, v32, 2
+; SI-NEXT:    v_readlane_b32 s35, v32, 1
+; SI-NEXT:    v_readlane_b32 s34, v32, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -25535,29 +25535,29 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v32, s30, 0
-; VI-NEXT:    v_writelane_b32 v32, s31, 1
-; VI-NEXT:    v_writelane_b32 v32, s34, 2
-; VI-NEXT:    v_writelane_b32 v32, s35, 3
-; VI-NEXT:    v_writelane_b32 v32, s36, 4
-; VI-NEXT:    v_writelane_b32 v32, s37, 5
-; VI-NEXT:    v_writelane_b32 v32, s38, 6
-; VI-NEXT:    v_writelane_b32 v32, s39, 7
-; VI-NEXT:    v_writelane_b32 v32, s48, 8
-; VI-NEXT:    v_writelane_b32 v32, s49, 9
-; VI-NEXT:    v_writelane_b32 v32, s50, 10
-; VI-NEXT:    v_writelane_b32 v32, s51, 11
-; VI-NEXT:    v_writelane_b32 v32, s52, 12
-; VI-NEXT:    v_writelane_b32 v32, s53, 13
-; VI-NEXT:    v_writelane_b32 v32, s54, 14
-; VI-NEXT:    v_writelane_b32 v32, s55, 15
-; VI-NEXT:    v_writelane_b32 v32, s64, 16
-; VI-NEXT:    v_writelane_b32 v32, s65, 17
-; VI-NEXT:    v_writelane_b32 v32, s66, 18
-; VI-NEXT:    v_writelane_b32 v32, s67, 19
-; VI-NEXT:    v_writelane_b32 v32, s68, 20
-; VI-NEXT:    v_writelane_b32 v32, s69, 21
-; VI-NEXT:    v_writelane_b32 v32, s70, 22
+; VI-NEXT:    v_writelane_b32 v32, s34, 0
+; VI-NEXT:    v_writelane_b32 v32, s35, 1
+; VI-NEXT:    v_writelane_b32 v32, s36, 2
+; VI-NEXT:    v_writelane_b32 v32, s37, 3
+; VI-NEXT:    v_writelane_b32 v32, s38, 4
+; VI-NEXT:    v_writelane_b32 v32, s39, 5
+; VI-NEXT:    v_writelane_b32 v32, s48, 6
+; VI-NEXT:    v_writelane_b32 v32, s49, 7
+; VI-NEXT:    v_writelane_b32 v32, s50, 8
+; VI-NEXT:    v_writelane_b32 v32, s51, 9
+; VI-NEXT:    v_writelane_b32 v32, s52, 10
+; VI-NEXT:    v_writelane_b32 v32, s53, 11
+; VI-NEXT:    v_writelane_b32 v32, s54, 12
+; VI-NEXT:    v_writelane_b32 v32, s55, 13
+; VI-NEXT:    v_writelane_b32 v32, s64, 14
+; VI-NEXT:    v_writelane_b32 v32, s65, 15
+; VI-NEXT:    v_writelane_b32 v32, s66, 16
+; VI-NEXT:    v_writelane_b32 v32, s67, 17
+; VI-NEXT:    v_writelane_b32 v32, s68, 18
+; VI-NEXT:    v_writelane_b32 v32, s69, 19
+; VI-NEXT:    v_writelane_b32 v32, s70, 20
+; VI-NEXT:    v_writelane_b32 v32, s71, 21
+; VI-NEXT:    v_writelane_b32 v32, s30, 22
 ; VI-NEXT:    v_readfirstlane_b32 s6, v7
 ; VI-NEXT:    v_readfirstlane_b32 s8, v6
 ; VI-NEXT:    v_readfirstlane_b32 s10, v5
@@ -25566,7 +25566,7 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i
 ; VI-NEXT:    v_readfirstlane_b32 s75, v2
 ; VI-NEXT:    v_readfirstlane_b32 s78, v1
 ; VI-NEXT:    v_readfirstlane_b32 s89, v0
-; VI-NEXT:    v_writelane_b32 v32, s71, 23
+; VI-NEXT:    v_writelane_b32 v32, s31, 23
 ; VI-NEXT:    s_lshr_b32 s13, s29, 16
 ; VI-NEXT:    s_lshr_b32 s72, s28, 16
 ; VI-NEXT:    s_lshr_b32 s74, s27, 16
@@ -25788,30 +25788,30 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i
 ; VI-NEXT:    v_mov_b32_e32 v30, s66
 ; VI-NEXT:    v_mov_b32_e32 v31, s67
 ; VI-NEXT:  .LBB47_5: ; %end
-; VI-NEXT:    v_readlane_b32 s71, v32, 23
-; VI-NEXT:    v_readlane_b32 s70, v32, 22
-; VI-NEXT:    v_readlane_b32 s69, v32, 21
-; VI-NEXT:    v_readlane_b32 s68, v32, 20
-; VI-NEXT:    v_readlane_b32 s67, v32, 19
-; VI-NEXT:    v_readlane_b32 s66, v32, 18
-; VI-NEXT:    v_readlane_b32 s65, v32, 17
-; VI-NEXT:    v_readlane_b32 s64, v32, 16
-; VI-NEXT:    v_readlane_b32 s55, v32, 15
-; VI-NEXT:    v_readlane_b32 s54, v32, 14
-; VI-NEXT:    v_readlane_b32 s53, v32, 13
-; VI-NEXT:    v_readlane_b32 s52, v32, 12
-; VI-NEXT:    v_readlane_b32 s51, v32, 11
-; VI-NEXT:    v_readlane_b32 s50, v32, 10
-; VI-NEXT:    v_readlane_b32 s49, v32, 9
-; VI-NEXT:    v_readlane_b32 s48, v32, 8
-; VI-NEXT:    v_readlane_b32 s39, v32, 7
-; VI-NEXT:    v_readlane_b32 s38, v32, 6
-; VI-NEXT:    v_readlane_b32 s37, v32, 5
-; VI-NEXT:    v_readlane_b32 s36, v32, 4
-; VI-NEXT:    v_readlane_b32 s35, v32, 3
-; VI-NEXT:    v_readlane_b32 s34, v32, 2
-; VI-NEXT:    v_readlane_b32 s31, v32, 1
-; VI-NEXT:    v_readlane_b32 s30, v32, 0
+; VI-NEXT:    v_readlane_b32 s30, v32, 22
+; VI-NEXT:    v_readlane_b32 s31, v32, 23
+; VI-NEXT:    v_readlane_b32 s71, v32, 21
+; VI-NEXT:    v_readlane_b32 s70, v32, 20
+; VI-NEXT:    v_readlane_b32 s69, v32, 19
+; VI-NEXT:    v_readlane_b32 s68, v32, 18
+; VI-NEXT:    v_readlane_b32 s67, v32, 17
+; VI-NEXT:    v_readlane_b32 s66, v32, 16
+; VI-NEXT:    v_readlane_b32 s65, v32, 15
+; VI-NEXT:    v_readlane_b32 s64, v32, 14
+; VI-NEXT:    v_readlane_b32 s55, v32, 13
+; VI-NEXT:    v_readlane_b32 s54, v32, 12
+; VI-NEXT:    v_readlane_b32 s53, v32, 11
+; VI-NEXT:    v_readlane_b32 s52, v32, 10
+; VI-NEXT:    v_readlane_b32 s51, v32, 9
+; VI-NEXT:    v_readlane_b32 s50, v32, 8
+; VI-NEXT:    v_readlane_b32 s49, v32, 7
+; VI-NEXT:    v_readlane_b32 s48, v32, 6
+; VI-NEXT:    v_readlane_b32 s39, v32, 5
+; VI-NEXT:    v_readlane_b32 s38, v32, 4
+; VI-NEXT:    v_readlane_b32 s37, v32, 3
+; VI-NEXT:    v_readlane_b32 s36, v32, 2
+; VI-NEXT:    v_readlane_b32 s35, v32, 1
+; VI-NEXT:    v_readlane_b32 s34, v32, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -26091,7 +26091,7 @@ end:
   ret <11 x i64> %phi
 }
 
-define <44 x i16> @bitcast_v11f64_to_v44i16(<11 x double> %a, i32 %b) {
+define <44 x i16> @bitcast_v11f64_to_v44i16(<11 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v11f64_to_v44i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -26662,7 +26662,7 @@ end:
   ret <44 x i16> %phi
 }
 
-define inreg <44 x i16> @bitcast_v11f64_to_v44i16_scalar(<11 x double> inreg %a, i32 inreg %b) {
+define inreg <44 x i16> @bitcast_v11f64_to_v44i16_scalar(<11 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v11f64_to_v44i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -27577,7 +27577,7 @@ end:
   ret <44 x i16> %phi
 }
 
-define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) {
+define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v44i16_to_v11f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -28525,7 +28525,7 @@ end:
   ret <11 x double> %phi
 }
 
-define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a, i32 inreg %b) {
+define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v44i16_to_v11f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -28533,23 +28533,25 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a,
 ; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v22, s30, 0
-; SI-NEXT:    v_writelane_b32 v22, s31, 1
-; SI-NEXT:    v_writelane_b32 v22, s34, 2
-; SI-NEXT:    v_writelane_b32 v22, s35, 3
-; SI-NEXT:    v_writelane_b32 v22, s36, 4
-; SI-NEXT:    v_writelane_b32 v22, s37, 5
-; SI-NEXT:    v_writelane_b32 v22, s38, 6
-; SI-NEXT:    v_writelane_b32 v22, s39, 7
-; SI-NEXT:    v_writelane_b32 v22, s48, 8
-; SI-NEXT:    v_writelane_b32 v22, s49, 9
-; SI-NEXT:    v_writelane_b32 v22, s50, 10
-; SI-NEXT:    v_writelane_b32 v22, s51, 11
-; SI-NEXT:    v_writelane_b32 v22, s52, 12
-; SI-NEXT:    v_writelane_b32 v22, s53, 13
-; SI-NEXT:    v_writelane_b32 v22, s54, 14
-; SI-NEXT:    v_writelane_b32 v22, s55, 15
-; SI-NEXT:    v_writelane_b32 v22, s64, 16
+; SI-NEXT:    v_writelane_b32 v22, s34, 0
+; SI-NEXT:    v_writelane_b32 v22, s35, 1
+; SI-NEXT:    v_writelane_b32 v22, s36, 2
+; SI-NEXT:    v_writelane_b32 v22, s37, 3
+; SI-NEXT:    v_writelane_b32 v22, s38, 4
+; SI-NEXT:    v_writelane_b32 v22, s39, 5
+; SI-NEXT:    v_writelane_b32 v22, s48, 6
+; SI-NEXT:    v_writelane_b32 v22, s49, 7
+; SI-NEXT:    v_writelane_b32 v22, s50, 8
+; SI-NEXT:    v_writelane_b32 v22, s51, 9
+; SI-NEXT:    v_writelane_b32 v22, s52, 10
+; SI-NEXT:    v_writelane_b32 v22, s53, 11
+; SI-NEXT:    v_writelane_b32 v22, s54, 12
+; SI-NEXT:    v_writelane_b32 v22, s55, 13
+; SI-NEXT:    v_writelane_b32 v22, s64, 14
+; SI-NEXT:    v_writelane_b32 v22, s65, 15
+; SI-NEXT:    v_writelane_b32 v22, s66, 16
+; SI-NEXT:    v_writelane_b32 v22, s67, 17
+; SI-NEXT:    v_writelane_b32 v22, s30, 18
 ; SI-NEXT:    v_readfirstlane_b32 s7, v7
 ; SI-NEXT:    v_readfirstlane_b32 s9, v6
 ; SI-NEXT:    v_readfirstlane_b32 s11, v5
@@ -28558,7 +28560,7 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a,
 ; SI-NEXT:    v_readfirstlane_b32 s75, v2
 ; SI-NEXT:    v_readfirstlane_b32 s79, v1
 ; SI-NEXT:    v_readfirstlane_b32 s90, v0
-; SI-NEXT:    v_writelane_b32 v22, s65, 17
+; SI-NEXT:    v_writelane_b32 v22, s31, 19
 ; SI-NEXT:    s_lshr_b32 s14, s29, 16
 ; SI-NEXT:    s_lshr_b32 s73, s28, 16
 ; SI-NEXT:    s_lshr_b32 s76, s27, 16
@@ -28582,9 +28584,7 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a,
 ; SI-NEXT:    s_lshr_b32 s77, s79, 16
 ; SI-NEXT:    s_lshr_b32 s89, s90, 16
 ; SI-NEXT:    v_readfirstlane_b32 s4, v8
-; SI-NEXT:    v_writelane_b32 v22, s66, 18
 ; SI-NEXT:    s_cmp_lg_u32 s4, 0
-; SI-NEXT:    v_writelane_b32 v22, s67, 19
 ; SI-NEXT:    s_cbranch_scc0 .LBB51_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_and_b32 s4, s16, 0xffff
@@ -28766,6 +28766,7 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a,
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_add_i32 s57, s4, 0x30000
 ; SI-NEXT:  .LBB51_3: ; %end
+; SI-NEXT:    v_readlane_b32 s30, v22, 18
 ; SI-NEXT:    v_mov_b32_e32 v0, s36
 ; SI-NEXT:    v_mov_b32_e32 v1, s37
 ; SI-NEXT:    v_mov_b32_e32 v2, s38
@@ -28788,26 +28789,25 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a,
 ; SI-NEXT:    v_mov_b32_e32 v19, s55
 ; SI-NEXT:    v_mov_b32_e32 v20, s56
 ; SI-NEXT:    v_mov_b32_e32 v21, s57
-; SI-NEXT:    v_readlane_b32 s67, v22, 19
-; SI-NEXT:    v_readlane_b32 s66, v22, 18
-; SI-NEXT:    v_readlane_b32 s65, v22, 17
-; SI-NEXT:    v_readlane_b32 s64, v22, 16
-; SI-NEXT:    v_readlane_b32 s55, v22, 15
-; SI-NEXT:    v_readlane_b32 s54, v22, 14
-; SI-NEXT:    v_readlane_b32 s53, v22, 13
-; SI-NEXT:    v_readlane_b32 s52, v22, 12
-; SI-NEXT:    v_readlane_b32 s51, v22, 11
-; SI-NEXT:    v_readlane_b32 s50, v22, 10
-; SI-NEXT:    v_readlane_b32 s49, v22, 9
-; SI-NEXT:    v_readlane_b32 s48, v22, 8
-; SI-NEXT:    v_readlane_b32 s39, v22, 7
-; SI-NEXT:    v_readlane_b32 s38, v22, 6
-; SI-NEXT:    v_readlane_b32 s37, v22, 5
-; SI-NEXT:    v_readlane_b32 s36, v22, 4
-; SI-NEXT:    v_readlane_b32 s35, v22, 3
-; SI-NEXT:    v_readlane_b32 s34, v22, 2
-; SI-NEXT:    v_readlane_b32 s31, v22, 1
-; SI-NEXT:    v_readlane_b32 s30, v22, 0
+; SI-NEXT:    v_readlane_b32 s31, v22, 19
+; SI-NEXT:    v_readlane_b32 s67, v22, 17
+; SI-NEXT:    v_readlane_b32 s66, v22, 16
+; SI-NEXT:    v_readlane_b32 s65, v22, 15
+; SI-NEXT:    v_readlane_b32 s64, v22, 14
+; SI-NEXT:    v_readlane_b32 s55, v22, 13
+; SI-NEXT:    v_readlane_b32 s54, v22, 12
+; SI-NEXT:    v_readlane_b32 s53, v22, 11
+; SI-NEXT:    v_readlane_b32 s52, v22, 10
+; SI-NEXT:    v_readlane_b32 s51, v22, 9
+; SI-NEXT:    v_readlane_b32 s50, v22, 8
+; SI-NEXT:    v_readlane_b32 s49, v22, 7
+; SI-NEXT:    v_readlane_b32 s48, v22, 6
+; SI-NEXT:    v_readlane_b32 s39, v22, 5
+; SI-NEXT:    v_readlane_b32 s38, v22, 4
+; SI-NEXT:    v_readlane_b32 s37, v22, 3
+; SI-NEXT:    v_readlane_b32 s36, v22, 2
+; SI-NEXT:    v_readlane_b32 s35, v22, 1
+; SI-NEXT:    v_readlane_b32 s34, v22, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -28823,29 +28823,29 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a,
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v22, s30, 0
-; VI-NEXT:    v_writelane_b32 v22, s31, 1
-; VI-NEXT:    v_writelane_b32 v22, s34, 2
-; VI-NEXT:    v_writelane_b32 v22, s35, 3
-; VI-NEXT:    v_writelane_b32 v22, s36, 4
-; VI-NEXT:    v_writelane_b32 v22, s37, 5
-; VI-NEXT:    v_writelane_b32 v22, s38, 6
-; VI-NEXT:    v_writelane_b32 v22, s39, 7
-; VI-NEXT:    v_writelane_b32 v22, s48, 8
-; VI-NEXT:    v_writelane_b32 v22, s49, 9
-; VI-NEXT:    v_writelane_b32 v22, s50, 10
-; VI-NEXT:    v_writelane_b32 v22, s51, 11
-; VI-NEXT:    v_writelane_b32 v22, s52, 12
-; VI-NEXT:    v_writelane_b32 v22, s53, 13
-; VI-NEXT:    v_writelane_b32 v22, s54, 14
-; VI-NEXT:    v_writelane_b32 v22, s55, 15
-; VI-NEXT:    v_writelane_b32 v22, s64, 16
-; VI-NEXT:    v_writelane_b32 v22, s65, 17
-; VI-NEXT:    v_writelane_b32 v22, s66, 18
-; VI-NEXT:    v_writelane_b32 v22, s67, 19
-; VI-NEXT:    v_writelane_b32 v22, s68, 20
-; VI-NEXT:    v_writelane_b32 v22, s69, 21
-; VI-NEXT:    v_writelane_b32 v22, s70, 22
+; VI-NEXT:    v_writelane_b32 v22, s34, 0
+; VI-NEXT:    v_writelane_b32 v22, s35, 1
+; VI-NEXT:    v_writelane_b32 v22, s36, 2
+; VI-NEXT:    v_writelane_b32 v22, s37, 3
+; VI-NEXT:    v_writelane_b32 v22, s38, 4
+; VI-NEXT:    v_writelane_b32 v22, s39, 5
+; VI-NEXT:    v_writelane_b32 v22, s48, 6
+; VI-NEXT:    v_writelane_b32 v22, s49, 7
+; VI-NEXT:    v_writelane_b32 v22, s50, 8
+; VI-NEXT:    v_writelane_b32 v22, s51, 9
+; VI-NEXT:    v_writelane_b32 v22, s52, 10
+; VI-NEXT:    v_writelane_b32 v22, s53, 11
+; VI-NEXT:    v_writelane_b32 v22, s54, 12
+; VI-NEXT:    v_writelane_b32 v22, s55, 13
+; VI-NEXT:    v_writelane_b32 v22, s64, 14
+; VI-NEXT:    v_writelane_b32 v22, s65, 15
+; VI-NEXT:    v_writelane_b32 v22, s66, 16
+; VI-NEXT:    v_writelane_b32 v22, s67, 17
+; VI-NEXT:    v_writelane_b32 v22, s68, 18
+; VI-NEXT:    v_writelane_b32 v22, s69, 19
+; VI-NEXT:    v_writelane_b32 v22, s70, 20
+; VI-NEXT:    v_writelane_b32 v22, s71, 21
+; VI-NEXT:    v_writelane_b32 v22, s30, 22
 ; VI-NEXT:    v_readfirstlane_b32 s7, v7
 ; VI-NEXT:    v_readfirstlane_b32 s9, v6
 ; VI-NEXT:    v_readfirstlane_b32 s11, v5
@@ -28854,7 +28854,7 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a,
 ; VI-NEXT:    v_readfirstlane_b32 s75, v2
 ; VI-NEXT:    v_readfirstlane_b32 s79, v1
 ; VI-NEXT:    v_readfirstlane_b32 s90, v0
-; VI-NEXT:    v_writelane_b32 v22, s71, 23
+; VI-NEXT:    v_writelane_b32 v22, s31, 23
 ; VI-NEXT:    s_lshr_b32 s14, s29, 16
 ; VI-NEXT:    s_lshr_b32 s73, s28, 16
 ; VI-NEXT:    s_lshr_b32 s76, s27, 16
@@ -29060,6 +29060,7 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a,
 ; VI-NEXT:    s_or_b32 s4, s5, s4
 ; VI-NEXT:    s_add_i32 s57, s4, 0x30000
 ; VI-NEXT:  .LBB51_3: ; %end
+; VI-NEXT:    v_readlane_b32 s30, v22, 22
 ; VI-NEXT:    v_mov_b32_e32 v0, s36
 ; VI-NEXT:    v_mov_b32_e32 v1, s37
 ; VI-NEXT:    v_mov_b32_e32 v2, s38
@@ -29082,30 +29083,29 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a,
 ; VI-NEXT:    v_mov_b32_e32 v19, s55
 ; VI-NEXT:    v_mov_b32_e32 v20, s56
 ; VI-NEXT:    v_mov_b32_e32 v21, s57
-; VI-NEXT:    v_readlane_b32 s71, v22, 23
-; VI-NEXT:    v_readlane_b32 s70, v22, 22
-; VI-NEXT:    v_readlane_b32 s69, v22, 21
-; VI-NEXT:    v_readlane_b32 s68, v22, 20
-; VI-NEXT:    v_readlane_b32 s67, v22, 19
-; VI-NEXT:    v_readlane_b32 s66, v22, 18
-; VI-NEXT:    v_readlane_b32 s65, v22, 17
-; VI-NEXT:    v_readlane_b32 s64, v22, 16
-; VI-NEXT:    v_readlane_b32 s55, v22, 15
-; VI-NEXT:    v_readlane_b32 s54, v22, 14
-; VI-NEXT:    v_readlane_b32 s53, v22, 13
-; VI-NEXT:    v_readlane_b32 s52, v22, 12
-; VI-NEXT:    v_readlane_b32 s51, v22, 11
-; VI-NEXT:    v_readlane_b32 s50, v22, 10
-; VI-NEXT:    v_readlane_b32 s49, v22, 9
-; VI-NEXT:    v_readlane_b32 s48, v22, 8
-; VI-NEXT:    v_readlane_b32 s39, v22, 7
-; VI-NEXT:    v_readlane_b32 s38, v22, 6
-; VI-NEXT:    v_readlane_b32 s37, v22, 5
-; VI-NEXT:    v_readlane_b32 s36, v22, 4
-; VI-NEXT:    v_readlane_b32 s35, v22, 3
-; VI-NEXT:    v_readlane_b32 s34, v22, 2
-; VI-NEXT:    v_readlane_b32 s31, v22, 1
-; VI-NEXT:    v_readlane_b32 s30, v22, 0
+; VI-NEXT:    v_readlane_b32 s31, v22, 23
+; VI-NEXT:    v_readlane_b32 s71, v22, 21
+; VI-NEXT:    v_readlane_b32 s70, v22, 20
+; VI-NEXT:    v_readlane_b32 s69, v22, 19
+; VI-NEXT:    v_readlane_b32 s68, v22, 18
+; VI-NEXT:    v_readlane_b32 s67, v22, 17
+; VI-NEXT:    v_readlane_b32 s66, v22, 16
+; VI-NEXT:    v_readlane_b32 s65, v22, 15
+; VI-NEXT:    v_readlane_b32 s64, v22, 14
+; VI-NEXT:    v_readlane_b32 s55, v22, 13
+; VI-NEXT:    v_readlane_b32 s54, v22, 12
+; VI-NEXT:    v_readlane_b32 s53, v22, 11
+; VI-NEXT:    v_readlane_b32 s52, v22, 10
+; VI-NEXT:    v_readlane_b32 s51, v22, 9
+; VI-NEXT:    v_readlane_b32 s50, v22, 8
+; VI-NEXT:    v_readlane_b32 s49, v22, 7
+; VI-NEXT:    v_readlane_b32 s48, v22, 6
+; VI-NEXT:    v_readlane_b32 s39, v22, 5
+; VI-NEXT:    v_readlane_b32 s38, v22, 4
+; VI-NEXT:    v_readlane_b32 s37, v22, 3
+; VI-NEXT:    v_readlane_b32 s36, v22, 2
+; VI-NEXT:    v_readlane_b32 s35, v22, 1
+; VI-NEXT:    v_readlane_b32 s34, v22, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -29387,7 +29387,7 @@ end:
   ret <11 x double> %phi
 }
 
-define <44 x half> @bitcast_v11f64_to_v44f16(<11 x double> %a, i32 %b) {
+define <44 x half> @bitcast_v11f64_to_v44f16(<11 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v11f64_to_v44f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -29958,7 +29958,7 @@ end:
   ret <44 x half> %phi
 }
 
-define inreg <44 x half> @bitcast_v11f64_to_v44f16_scalar(<11 x double> inreg %a, i32 inreg %b) {
+define inreg <44 x half> @bitcast_v11f64_to_v44f16_scalar(<11 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v11f64_to_v44f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -30873,7 +30873,7 @@ end:
   ret <44 x half> %phi
 }
 
-define <11 x double> @bitcast_v44f16_to_v11f64(<44 x half> %a, i32 %b) {
+define <11 x double> @bitcast_v44f16_to_v11f64(<44 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v44f16_to_v11f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -31920,7 +31920,7 @@ end:
   ret <11 x double> %phi
 }
 
-define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a, i32 inreg %b) {
+define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v44f16_to_v11f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -31928,23 +31928,25 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a
 ; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v32, s30, 0
-; SI-NEXT:    v_writelane_b32 v32, s31, 1
-; SI-NEXT:    v_writelane_b32 v32, s34, 2
-; SI-NEXT:    v_writelane_b32 v32, s35, 3
-; SI-NEXT:    v_writelane_b32 v32, s36, 4
-; SI-NEXT:    v_writelane_b32 v32, s37, 5
-; SI-NEXT:    v_writelane_b32 v32, s38, 6
-; SI-NEXT:    v_writelane_b32 v32, s39, 7
-; SI-NEXT:    v_writelane_b32 v32, s48, 8
-; SI-NEXT:    v_writelane_b32 v32, s49, 9
-; SI-NEXT:    v_writelane_b32 v32, s50, 10
-; SI-NEXT:    v_writelane_b32 v32, s51, 11
-; SI-NEXT:    v_writelane_b32 v32, s52, 12
-; SI-NEXT:    v_writelane_b32 v32, s53, 13
-; SI-NEXT:    v_writelane_b32 v32, s54, 14
-; SI-NEXT:    v_writelane_b32 v32, s55, 15
-; SI-NEXT:    v_writelane_b32 v32, s64, 16
+; SI-NEXT:    v_writelane_b32 v32, s34, 0
+; SI-NEXT:    v_writelane_b32 v32, s35, 1
+; SI-NEXT:    v_writelane_b32 v32, s36, 2
+; SI-NEXT:    v_writelane_b32 v32, s37, 3
+; SI-NEXT:    v_writelane_b32 v32, s38, 4
+; SI-NEXT:    v_writelane_b32 v32, s39, 5
+; SI-NEXT:    v_writelane_b32 v32, s48, 6
+; SI-NEXT:    v_writelane_b32 v32, s49, 7
+; SI-NEXT:    v_writelane_b32 v32, s50, 8
+; SI-NEXT:    v_writelane_b32 v32, s51, 9
+; SI-NEXT:    v_writelane_b32 v32, s52, 10
+; SI-NEXT:    v_writelane_b32 v32, s53, 11
+; SI-NEXT:    v_writelane_b32 v32, s54, 12
+; SI-NEXT:    v_writelane_b32 v32, s55, 13
+; SI-NEXT:    v_writelane_b32 v32, s64, 14
+; SI-NEXT:    v_writelane_b32 v32, s65, 15
+; SI-NEXT:    v_writelane_b32 v32, s66, 16
+; SI-NEXT:    v_writelane_b32 v32, s67, 17
+; SI-NEXT:    v_writelane_b32 v32, s30, 18
 ; SI-NEXT:    v_readfirstlane_b32 s6, v7
 ; SI-NEXT:    v_readfirstlane_b32 s8, v6
 ; SI-NEXT:    v_readfirstlane_b32 s10, v5
@@ -31953,7 +31955,7 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a
 ; SI-NEXT:    v_readfirstlane_b32 s72, v2
 ; SI-NEXT:    v_readfirstlane_b32 s74, v1
 ; SI-NEXT:    v_readfirstlane_b32 s77, v0
-; SI-NEXT:    v_writelane_b32 v32, s65, 17
+; SI-NEXT:    v_writelane_b32 v32, s31, 19
 ; SI-NEXT:    s_lshr_b32 s75, s29, 16
 ; SI-NEXT:    s_lshr_b32 s78, s28, 16
 ; SI-NEXT:    s_lshr_b32 s88, s27, 16
@@ -31977,9 +31979,7 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a
 ; SI-NEXT:    s_lshr_b32 s76, s74, 16
 ; SI-NEXT:    s_lshr_b32 s79, s77, 16
 ; SI-NEXT:    v_readfirstlane_b32 s4, v8
-; SI-NEXT:    v_writelane_b32 v32, s66, 18
 ; SI-NEXT:    s_cmp_lg_u32 s4, 0
-; SI-NEXT:    v_writelane_b32 v32, s67, 19
 ; SI-NEXT:    s_cbranch_scc0 .LBB55_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_and_b32 s4, s16, 0xffff
@@ -32264,26 +32264,26 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a
 ; SI-NEXT:    v_mov_b32_e32 v30, s66
 ; SI-NEXT:    v_mov_b32_e32 v31, s67
 ; SI-NEXT:  .LBB55_5: ; %end
-; SI-NEXT:    v_readlane_b32 s67, v32, 19
-; SI-NEXT:    v_readlane_b32 s66, v32, 18
-; SI-NEXT:    v_readlane_b32 s65, v32, 17
-; SI-NEXT:    v_readlane_b32 s64, v32, 16
-; SI-NEXT:    v_readlane_b32 s55, v32, 15
-; SI-NEXT:    v_readlane_b32 s54, v32, 14
-; SI-NEXT:    v_readlane_b32 s53, v32, 13
-; SI-NEXT:    v_readlane_b32 s52, v32, 12
-; SI-NEXT:    v_readlane_b32 s51, v32, 11
-; SI-NEXT:    v_readlane_b32 s50, v32, 10
-; SI-NEXT:    v_readlane_b32 s49, v32, 9
-; SI-NEXT:    v_readlane_b32 s48, v32, 8
-; SI-NEXT:    v_readlane_b32 s39, v32, 7
-; SI-NEXT:    v_readlane_b32 s38, v32, 6
-; SI-NEXT:    v_readlane_b32 s37, v32, 5
-; SI-NEXT:    v_readlane_b32 s36, v32, 4
-; SI-NEXT:    v_readlane_b32 s35, v32, 3
-; SI-NEXT:    v_readlane_b32 s34, v32, 2
-; SI-NEXT:    v_readlane_b32 s31, v32, 1
-; SI-NEXT:    v_readlane_b32 s30, v32, 0
+; SI-NEXT:    v_readlane_b32 s30, v32, 18
+; SI-NEXT:    v_readlane_b32 s31, v32, 19
+; SI-NEXT:    v_readlane_b32 s67, v32, 17
+; SI-NEXT:    v_readlane_b32 s66, v32, 16
+; SI-NEXT:    v_readlane_b32 s65, v32, 15
+; SI-NEXT:    v_readlane_b32 s64, v32, 14
+; SI-NEXT:    v_readlane_b32 s55, v32, 13
+; SI-NEXT:    v_readlane_b32 s54, v32, 12
+; SI-NEXT:    v_readlane_b32 s53, v32, 11
+; SI-NEXT:    v_readlane_b32 s52, v32, 10
+; SI-NEXT:    v_readlane_b32 s51, v32, 9
+; SI-NEXT:    v_readlane_b32 s50, v32, 8
+; SI-NEXT:    v_readlane_b32 s49, v32, 7
+; SI-NEXT:    v_readlane_b32 s48, v32, 6
+; SI-NEXT:    v_readlane_b32 s39, v32, 5
+; SI-NEXT:    v_readlane_b32 s38, v32, 4
+; SI-NEXT:    v_readlane_b32 s37, v32, 3
+; SI-NEXT:    v_readlane_b32 s36, v32, 2
+; SI-NEXT:    v_readlane_b32 s35, v32, 1
+; SI-NEXT:    v_readlane_b32 s34, v32, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -32296,29 +32296,29 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v32, s30, 0
-; VI-NEXT:    v_writelane_b32 v32, s31, 1
-; VI-NEXT:    v_writelane_b32 v32, s34, 2
-; VI-NEXT:    v_writelane_b32 v32, s35, 3
-; VI-NEXT:    v_writelane_b32 v32, s36, 4
-; VI-NEXT:    v_writelane_b32 v32, s37, 5
-; VI-NEXT:    v_writelane_b32 v32, s38, 6
-; VI-NEXT:    v_writelane_b32 v32, s39, 7
-; VI-NEXT:    v_writelane_b32 v32, s48, 8
-; VI-NEXT:    v_writelane_b32 v32, s49, 9
-; VI-NEXT:    v_writelane_b32 v32, s50, 10
-; VI-NEXT:    v_writelane_b32 v32, s51, 11
-; VI-NEXT:    v_writelane_b32 v32, s52, 12
-; VI-NEXT:    v_writelane_b32 v32, s53, 13
-; VI-NEXT:    v_writelane_b32 v32, s54, 14
-; VI-NEXT:    v_writelane_b32 v32, s55, 15
-; VI-NEXT:    v_writelane_b32 v32, s64, 16
-; VI-NEXT:    v_writelane_b32 v32, s65, 17
-; VI-NEXT:    v_writelane_b32 v32, s66, 18
-; VI-NEXT:    v_writelane_b32 v32, s67, 19
-; VI-NEXT:    v_writelane_b32 v32, s68, 20
-; VI-NEXT:    v_writelane_b32 v32, s69, 21
-; VI-NEXT:    v_writelane_b32 v32, s70, 22
+; VI-NEXT:    v_writelane_b32 v32, s34, 0
+; VI-NEXT:    v_writelane_b32 v32, s35, 1
+; VI-NEXT:    v_writelane_b32 v32, s36, 2
+; VI-NEXT:    v_writelane_b32 v32, s37, 3
+; VI-NEXT:    v_writelane_b32 v32, s38, 4
+; VI-NEXT:    v_writelane_b32 v32, s39, 5
+; VI-NEXT:    v_writelane_b32 v32, s48, 6
+; VI-NEXT:    v_writelane_b32 v32, s49, 7
+; VI-NEXT:    v_writelane_b32 v32, s50, 8
+; VI-NEXT:    v_writelane_b32 v32, s51, 9
+; VI-NEXT:    v_writelane_b32 v32, s52, 10
+; VI-NEXT:    v_writelane_b32 v32, s53, 11
+; VI-NEXT:    v_writelane_b32 v32, s54, 12
+; VI-NEXT:    v_writelane_b32 v32, s55, 13
+; VI-NEXT:    v_writelane_b32 v32, s64, 14
+; VI-NEXT:    v_writelane_b32 v32, s65, 15
+; VI-NEXT:    v_writelane_b32 v32, s66, 16
+; VI-NEXT:    v_writelane_b32 v32, s67, 17
+; VI-NEXT:    v_writelane_b32 v32, s68, 18
+; VI-NEXT:    v_writelane_b32 v32, s69, 19
+; VI-NEXT:    v_writelane_b32 v32, s70, 20
+; VI-NEXT:    v_writelane_b32 v32, s71, 21
+; VI-NEXT:    v_writelane_b32 v32, s30, 22
 ; VI-NEXT:    v_readfirstlane_b32 s6, v7
 ; VI-NEXT:    v_readfirstlane_b32 s8, v6
 ; VI-NEXT:    v_readfirstlane_b32 s10, v5
@@ -32327,7 +32327,7 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a
 ; VI-NEXT:    v_readfirstlane_b32 s75, v2
 ; VI-NEXT:    v_readfirstlane_b32 s78, v1
 ; VI-NEXT:    v_readfirstlane_b32 s89, v0
-; VI-NEXT:    v_writelane_b32 v32, s71, 23
+; VI-NEXT:    v_writelane_b32 v32, s31, 23
 ; VI-NEXT:    s_lshr_b32 s13, s29, 16
 ; VI-NEXT:    s_lshr_b32 s72, s28, 16
 ; VI-NEXT:    s_lshr_b32 s74, s27, 16
@@ -32549,30 +32549,30 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a
 ; VI-NEXT:    v_mov_b32_e32 v30, s66
 ; VI-NEXT:    v_mov_b32_e32 v31, s67
 ; VI-NEXT:  .LBB55_5: ; %end
-; VI-NEXT:    v_readlane_b32 s71, v32, 23
-; VI-NEXT:    v_readlane_b32 s70, v32, 22
-; VI-NEXT:    v_readlane_b32 s69, v32, 21
-; VI-NEXT:    v_readlane_b32 s68, v32, 20
-; VI-NEXT:    v_readlane_b32 s67, v32, 19
-; VI-NEXT:    v_readlane_b32 s66, v32, 18
-; VI-NEXT:    v_readlane_b32 s65, v32, 17
-; VI-NEXT:    v_readlane_b32 s64, v32, 16
-; VI-NEXT:    v_readlane_b32 s55, v32, 15
-; VI-NEXT:    v_readlane_b32 s54, v32, 14
-; VI-NEXT:    v_readlane_b32 s53, v32, 13
-; VI-NEXT:    v_readlane_b32 s52, v32, 12
-; VI-NEXT:    v_readlane_b32 s51, v32, 11
-; VI-NEXT:    v_readlane_b32 s50, v32, 10
-; VI-NEXT:    v_readlane_b32 s49, v32, 9
-; VI-NEXT:    v_readlane_b32 s48, v32, 8
-; VI-NEXT:    v_readlane_b32 s39, v32, 7
-; VI-NEXT:    v_readlane_b32 s38, v32, 6
-; VI-NEXT:    v_readlane_b32 s37, v32, 5
-; VI-NEXT:    v_readlane_b32 s36, v32, 4
-; VI-NEXT:    v_readlane_b32 s35, v32, 3
-; VI-NEXT:    v_readlane_b32 s34, v32, 2
-; VI-NEXT:    v_readlane_b32 s31, v32, 1
-; VI-NEXT:    v_readlane_b32 s30, v32, 0
+; VI-NEXT:    v_readlane_b32 s30, v32, 22
+; VI-NEXT:    v_readlane_b32 s31, v32, 23
+; VI-NEXT:    v_readlane_b32 s71, v32, 21
+; VI-NEXT:    v_readlane_b32 s70, v32, 20
+; VI-NEXT:    v_readlane_b32 s69, v32, 19
+; VI-NEXT:    v_readlane_b32 s68, v32, 18
+; VI-NEXT:    v_readlane_b32 s67, v32, 17
+; VI-NEXT:    v_readlane_b32 s66, v32, 16
+; VI-NEXT:    v_readlane_b32 s65, v32, 15
+; VI-NEXT:    v_readlane_b32 s64, v32, 14
+; VI-NEXT:    v_readlane_b32 s55, v32, 13
+; VI-NEXT:    v_readlane_b32 s54, v32, 12
+; VI-NEXT:    v_readlane_b32 s53, v32, 11
+; VI-NEXT:    v_readlane_b32 s52, v32, 10
+; VI-NEXT:    v_readlane_b32 s51, v32, 9
+; VI-NEXT:    v_readlane_b32 s50, v32, 8
+; VI-NEXT:    v_readlane_b32 s49, v32, 7
+; VI-NEXT:    v_readlane_b32 s48, v32, 6
+; VI-NEXT:    v_readlane_b32 s39, v32, 5
+; VI-NEXT:    v_readlane_b32 s38, v32, 4
+; VI-NEXT:    v_readlane_b32 s37, v32, 3
+; VI-NEXT:    v_readlane_b32 s36, v32, 2
+; VI-NEXT:    v_readlane_b32 s35, v32, 1
+; VI-NEXT:    v_readlane_b32 s34, v32, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -32852,7 +32852,7 @@ end:
   ret <11 x double> %phi
 }
 
-define <44 x half> @bitcast_v44i16_to_v44f16(<44 x i16> %a, i32 %b) {
+define <44 x half> @bitcast_v44i16_to_v44f16(<44 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v44i16_to_v44f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -33834,7 +33834,7 @@ end:
   ret <44 x half> %phi
 }
 
-define inreg <44 x half> @bitcast_v44i16_to_v44f16_scalar(<44 x i16> inreg %a, i32 inreg %b) {
+define inreg <44 x half> @bitcast_v44i16_to_v44f16_scalar(<44 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v44i16_to_v44f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -33842,36 +33842,35 @@ define inreg <44 x half> @bitcast_v44i16_to_v44f16_scalar(<44 x i16> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v22, s30, 0
-; SI-NEXT:    v_writelane_b32 v22, s31, 1
-; SI-NEXT:    v_writelane_b32 v22, s34, 2
-; SI-NEXT:    v_writelane_b32 v22, s35, 3
-; SI-NEXT:    v_writelane_b32 v22, s36, 4
-; SI-NEXT:    v_writelane_b32 v22, s37, 5
-; SI-NEXT:    v_writelane_b32 v22, s38, 6
-; SI-NEXT:    v_writelane_b32 v22, s39, 7
-; SI-NEXT:    v_writelane_b32 v22, s48, 8
-; SI-NEXT:    v_writelane_b32 v22, s49, 9
-; SI-NEXT:    v_writelane_b32 v22, s50, 10
-; SI-NEXT:    v_writelane_b32 v22, s51, 11
-; SI-NEXT:    v_writelane_b32 v22, s52, 12
-; SI-NEXT:    v_writelane_b32 v22, s53, 13
-; SI-NEXT:    v_writelane_b32 v22, s54, 14
-; SI-NEXT:    v_writelane_b32 v22, s55, 15
-; SI-NEXT:    v_writelane_b32 v22, s64, 16
-; SI-NEXT:    v_writelane_b32 v22, s65, 17
-; SI-NEXT:    v_writelane_b32 v22, s66, 18
-; SI-NEXT:    v_writelane_b32 v22, s67, 19
-; SI-NEXT:    v_writelane_b32 v22, s68, 20
-; SI-NEXT:    v_writelane_b32 v22, s69, 21
-; SI-NEXT:    v_writelane_b32 v22, s70, 22
-; SI-NEXT:    v_writelane_b32 v22, s71, 23
-; SI-NEXT:    v_writelane_b32 v22, s80, 24
-; SI-NEXT:    v_writelane_b32 v22, s81, 25
-; SI-NEXT:    v_writelane_b32 v22, s82, 26
-; SI-NEXT:    v_writelane_b32 v22, s83, 27
-; SI-NEXT:    v_writelane_b32 v22, s84, 28
-; SI-NEXT:    v_writelane_b32 v22, s85, 29
+; SI-NEXT:    v_writelane_b32 v22, s34, 0
+; SI-NEXT:    v_writelane_b32 v22, s35, 1
+; SI-NEXT:    v_writelane_b32 v22, s36, 2
+; SI-NEXT:    v_writelane_b32 v22, s37, 3
+; SI-NEXT:    v_writelane_b32 v22, s38, 4
+; SI-NEXT:    v_writelane_b32 v22, s39, 5
+; SI-NEXT:    v_writelane_b32 v22, s48, 6
+; SI-NEXT:    v_writelane_b32 v22, s49, 7
+; SI-NEXT:    v_writelane_b32 v22, s50, 8
+; SI-NEXT:    v_writelane_b32 v22, s51, 9
+; SI-NEXT:    v_writelane_b32 v22, s52, 10
+; SI-NEXT:    v_writelane_b32 v22, s53, 11
+; SI-NEXT:    v_writelane_b32 v22, s54, 12
+; SI-NEXT:    v_writelane_b32 v22, s55, 13
+; SI-NEXT:    v_writelane_b32 v22, s64, 14
+; SI-NEXT:    v_writelane_b32 v22, s65, 15
+; SI-NEXT:    v_writelane_b32 v22, s66, 16
+; SI-NEXT:    v_writelane_b32 v22, s67, 17
+; SI-NEXT:    v_writelane_b32 v22, s68, 18
+; SI-NEXT:    v_writelane_b32 v22, s69, 19
+; SI-NEXT:    v_writelane_b32 v22, s70, 20
+; SI-NEXT:    v_writelane_b32 v22, s71, 21
+; SI-NEXT:    v_writelane_b32 v22, s80, 22
+; SI-NEXT:    v_writelane_b32 v22, s81, 23
+; SI-NEXT:    v_writelane_b32 v22, s82, 24
+; SI-NEXT:    v_writelane_b32 v22, s83, 25
+; SI-NEXT:    v_writelane_b32 v22, s84, 26
+; SI-NEXT:    v_writelane_b32 v22, s85, 27
+; SI-NEXT:    v_writelane_b32 v22, s30, 28
 ; SI-NEXT:    v_readfirstlane_b32 s83, v7
 ; SI-NEXT:    v_readfirstlane_b32 s85, v6
 ; SI-NEXT:    v_readfirstlane_b32 s80, v5
@@ -33880,6 +33879,7 @@ define inreg <44 x half> @bitcast_v44i16_to_v44f16_scalar(<44 x i16> inreg %a, i
 ; SI-NEXT:    v_readfirstlane_b32 s71, v2
 ; SI-NEXT:    v_readfirstlane_b32 s66, v1
 ; SI-NEXT:    v_readfirstlane_b32 s68, v0
+; SI-NEXT:    v_writelane_b32 v22, s31, 29
 ; SI-NEXT:    s_lshr_b32 s38, s29, 16
 ; SI-NEXT:    s_lshr_b32 s65, s28, 16
 ; SI-NEXT:    s_lshr_b32 s37, s27, 16
@@ -34195,6 +34195,7 @@ define inreg <44 x half> @bitcast_v44i16_to_v44f16_scalar(<44 x i16> inreg %a, i
 ; SI-NEXT:    s_and_b32 s15, s15, 0xffff
 ; SI-NEXT:    s_lshl_b32 s26, s50, 16
 ; SI-NEXT:    s_or_b32 s15, s15, s26
+; SI-NEXT:    v_readlane_b32 s30, v22, 28
 ; SI-NEXT:    v_mov_b32_e32 v0, s12
 ; SI-NEXT:    v_mov_b32_e32 v1, s13
 ; SI-NEXT:    v_mov_b32_e32 v2, s10
@@ -34217,36 +34218,35 @@ define inreg <44 x half> @bitcast_v44i16_to_v44f16_scalar(<44 x i16> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v19, s25
 ; SI-NEXT:    v_mov_b32_e32 v20, s14
 ; SI-NEXT:    v_mov_b32_e32 v21, s15
-; SI-NEXT:    v_readlane_b32 s85, v22, 29
-; SI-NEXT:    v_readlane_b32 s84, v22, 28
-; SI-NEXT:    v_readlane_b32 s83, v22, 27
-; SI-NEXT:    v_readlane_b32 s82, v22, 26
-; SI-NEXT:    v_readlane_b32 s81, v22, 25
-; SI-NEXT:    v_readlane_b32 s80, v22, 24
-; SI-NEXT:    v_readlane_b32 s71, v22, 23
-; SI-NEXT:    v_readlane_b32 s70, v22, 22
-; SI-NEXT:    v_readlane_b32 s69, v22, 21
-; SI-NEXT:    v_readlane_b32 s68, v22, 20
-; SI-NEXT:    v_readlane_b32 s67, v22, 19
-; SI-NEXT:    v_readlane_b32 s66, v22, 18
-; SI-NEXT:    v_readlane_b32 s65, v22, 17
-; SI-NEXT:    v_readlane_b32 s64, v22, 16
-; SI-NEXT:    v_readlane_b32 s55, v22, 15
-; SI-NEXT:    v_readlane_b32 s54, v22, 14
-; SI-NEXT:    v_readlane_b32 s53, v22, 13
-; SI-NEXT:    v_readlane_b32 s52, v22, 12
-; SI-NEXT:    v_readlane_b32 s51, v22, 11
-; SI-NEXT:    v_readlane_b32 s50, v22, 10
-; SI-NEXT:    v_readlane_b32 s49, v22, 9
-; SI-NEXT:    v_readlane_b32 s48, v22, 8
-; SI-NEXT:    v_readlane_b32 s39, v22, 7
-; SI-NEXT:    v_readlane_b32 s38, v22, 6
-; SI-NEXT:    v_readlane_b32 s37, v22, 5
-; SI-NEXT:    v_readlane_b32 s36, v22, 4
-; SI-NEXT:    v_readlane_b32 s35, v22, 3
-; SI-NEXT:    v_readlane_b32 s34, v22, 2
-; SI-NEXT:    v_readlane_b32 s31, v22, 1
-; SI-NEXT:    v_readlane_b32 s30, v22, 0
+; SI-NEXT:    v_readlane_b32 s31, v22, 29
+; SI-NEXT:    v_readlane_b32 s85, v22, 27
+; SI-NEXT:    v_readlane_b32 s84, v22, 26
+; SI-NEXT:    v_readlane_b32 s83, v22, 25
+; SI-NEXT:    v_readlane_b32 s82, v22, 24
+; SI-NEXT:    v_readlane_b32 s81, v22, 23
+; SI-NEXT:    v_readlane_b32 s80, v22, 22
+; SI-NEXT:    v_readlane_b32 s71, v22, 21
+; SI-NEXT:    v_readlane_b32 s70, v22, 20
+; SI-NEXT:    v_readlane_b32 s69, v22, 19
+; SI-NEXT:    v_readlane_b32 s68, v22, 18
+; SI-NEXT:    v_readlane_b32 s67, v22, 17
+; SI-NEXT:    v_readlane_b32 s66, v22, 16
+; SI-NEXT:    v_readlane_b32 s65, v22, 15
+; SI-NEXT:    v_readlane_b32 s64, v22, 14
+; SI-NEXT:    v_readlane_b32 s55, v22, 13
+; SI-NEXT:    v_readlane_b32 s54, v22, 12
+; SI-NEXT:    v_readlane_b32 s53, v22, 11
+; SI-NEXT:    v_readlane_b32 s52, v22, 10
+; SI-NEXT:    v_readlane_b32 s51, v22, 9
+; SI-NEXT:    v_readlane_b32 s50, v22, 8
+; SI-NEXT:    v_readlane_b32 s49, v22, 7
+; SI-NEXT:    v_readlane_b32 s48, v22, 6
+; SI-NEXT:    v_readlane_b32 s39, v22, 5
+; SI-NEXT:    v_readlane_b32 s38, v22, 4
+; SI-NEXT:    v_readlane_b32 s37, v22, 3
+; SI-NEXT:    v_readlane_b32 s36, v22, 2
+; SI-NEXT:    v_readlane_b32 s35, v22, 1
+; SI-NEXT:    v_readlane_b32 s34, v22, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -35001,7 +35001,7 @@ end:
   ret <44 x half> %phi
 }
 
-define <44 x i16> @bitcast_v44f16_to_v44i16(<44 x half> %a, i32 %b) {
+define <44 x i16> @bitcast_v44f16_to_v44i16(<44 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v44f16_to_v44i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -35715,7 +35715,7 @@ end:
   ret <44 x i16> %phi
 }
 
-define inreg <44 x i16> @bitcast_v44f16_to_v44i16_scalar(<44 x half> inreg %a, i32 inreg %b) {
+define inreg <44 x i16> @bitcast_v44f16_to_v44i16_scalar(<44 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v44f16_to_v44i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -36832,3 +36832,5 @@ end:
   %phi = phi <44 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
   ret <44 x i16> %phi
 }
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll
index ba58d8a9fba52..1194fa2305563 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll
@@ -6,7 +6,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
-define <24 x float> @bitcast_v24i32_to_v24f32(<24 x i32> %a, i32 %b) {
+define <24 x float> @bitcast_v24i32_to_v24f32(<24 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v24i32_to_v24f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -172,7 +172,7 @@ end:
   ret <24 x float> %phi
 }
 
-define inreg <24 x float> @bitcast_v24i32_to_v24f32_scalar(<24 x i32> inreg %a, i32 inreg %b) {
+define inreg <24 x float> @bitcast_v24i32_to_v24f32_scalar(<24 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v24i32_to_v24f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -464,7 +464,7 @@ end:
   ret <24 x float> %phi
 }
 
-define <24 x i32> @bitcast_v24f32_to_v24i32(<24 x float> %a, i32 %b) {
+define <24 x i32> @bitcast_v24f32_to_v24i32(<24 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v24f32_to_v24i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -618,7 +618,7 @@ end:
   ret <24 x i32> %phi
 }
 
-define inreg <24 x i32> @bitcast_v24f32_to_v24i32_scalar(<24 x float> inreg %a, i32 inreg %b) {
+define inreg <24 x i32> @bitcast_v24f32_to_v24i32_scalar(<24 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v24f32_to_v24i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1130,7 +1130,7 @@ end:
   ret <24 x i32> %phi
 }
 
-define <12 x i64> @bitcast_v24i32_to_v12i64(<24 x i32> %a, i32 %b) {
+define <12 x i64> @bitcast_v24i32_to_v12i64(<24 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v24i32_to_v12i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1296,7 +1296,7 @@ end:
   ret <12 x i64> %phi
 }
 
-define inreg <12 x i64> @bitcast_v24i32_to_v12i64_scalar(<24 x i32> inreg %a, i32 inreg %b) {
+define inreg <12 x i64> @bitcast_v24i32_to_v12i64_scalar(<24 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v24i32_to_v12i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1588,7 +1588,7 @@ end:
   ret <12 x i64> %phi
 }
 
-define <24 x i32> @bitcast_v12i64_to_v24i32(<12 x i64> %a, i32 %b) {
+define <24 x i32> @bitcast_v12i64_to_v24i32(<12 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v12i64_to_v24i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1760,7 +1760,7 @@ end:
   ret <24 x i32> %phi
 }
 
-define inreg <24 x i32> @bitcast_v12i64_to_v24i32_scalar(<12 x i64> inreg %a, i32 inreg %b) {
+define inreg <24 x i32> @bitcast_v12i64_to_v24i32_scalar(<12 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v12i64_to_v24i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2052,7 +2052,7 @@ end:
   ret <24 x i32> %phi
 }
 
-define <12 x double> @bitcast_v24i32_to_v12f64(<24 x i32> %a, i32 %b) {
+define <12 x double> @bitcast_v24i32_to_v12f64(<24 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v24i32_to_v12f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2218,7 +2218,7 @@ end:
   ret <12 x double> %phi
 }
 
-define inreg <12 x double> @bitcast_v24i32_to_v12f64_scalar(<24 x i32> inreg %a, i32 inreg %b) {
+define inreg <12 x double> @bitcast_v24i32_to_v12f64_scalar(<24 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v24i32_to_v12f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2510,7 +2510,7 @@ end:
   ret <12 x double> %phi
 }
 
-define <24 x i32> @bitcast_v12f64_to_v24i32(<12 x double> %a, i32 %b) {
+define <24 x i32> @bitcast_v12f64_to_v24i32(<12 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v12f64_to_v24i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2628,7 +2628,7 @@ end:
   ret <24 x i32> %phi
 }
 
-define inreg <24 x i32> @bitcast_v12f64_to_v24i32_scalar(<12 x double> inreg %a, i32 inreg %b) {
+define inreg <24 x i32> @bitcast_v12f64_to_v24i32_scalar(<12 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v12f64_to_v24i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3092,7 +3092,7 @@ end:
   ret <24 x i32> %phi
 }
 
-define <48 x i16> @bitcast_v24i32_to_v48i16(<24 x i32> %a, i32 %b) {
+define <48 x i16> @bitcast_v24i32_to_v48i16(<24 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v24i32_to_v48i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3766,7 +3766,7 @@ end:
   ret <48 x i16> %phi
 }
 
-define inreg <48 x i16> @bitcast_v24i32_to_v48i16_scalar(<24 x i32> inreg %a, i32 inreg %b) {
+define inreg <48 x i16> @bitcast_v24i32_to_v48i16_scalar(<24 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v24i32_to_v48i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3774,10 +3774,10 @@ define inreg <48 x i16> @bitcast_v24i32_to_v48i16_scalar(<24 x i32> inreg %a, i3
 ; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v24, s30, 0
-; SI-NEXT:    v_writelane_b32 v24, s31, 1
+; SI-NEXT:    v_writelane_b32 v24, s34, 0
+; SI-NEXT:    v_writelane_b32 v24, s35, 1
 ; SI-NEXT:    v_readfirstlane_b32 s12, v10
-; SI-NEXT:    v_writelane_b32 v24, s34, 2
+; SI-NEXT:    v_writelane_b32 v24, s30, 2
 ; SI-NEXT:    v_readfirstlane_b32 s5, v9
 ; SI-NEXT:    v_readfirstlane_b32 s4, v8
 ; SI-NEXT:    v_readfirstlane_b32 s7, v7
@@ -3789,7 +3789,7 @@ define inreg <48 x i16> @bitcast_v24i32_to_v48i16_scalar(<24 x i32> inreg %a, i3
 ; SI-NEXT:    v_readfirstlane_b32 s13, v1
 ; SI-NEXT:    s_cmp_lg_u32 s12, 0
 ; SI-NEXT:    v_readfirstlane_b32 s12, v0
-; SI-NEXT:    v_writelane_b32 v24, s35, 3
+; SI-NEXT:    v_writelane_b32 v24, s31, 3
 ; SI-NEXT:    s_cbranch_scc0 .LBB13_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s88, s5, 16
@@ -3939,6 +3939,7 @@ define inreg <48 x i16> @bitcast_v24i32_to_v48i16_scalar(<24 x i32> inreg %a, i3
 ; SI-NEXT:    s_lshl_b32 s14, s88, 16
 ; SI-NEXT:    s_or_b32 s7, s7, s29
 ; SI-NEXT:    s_or_b32 s5, s5, s14
+; SI-NEXT:    v_readlane_b32 s30, v24, 2
 ; SI-NEXT:    v_mov_b32_e32 v0, s15
 ; SI-NEXT:    v_mov_b32_e32 v1, s16
 ; SI-NEXT:    v_mov_b32_e32 v2, s17
@@ -3963,10 +3964,9 @@ define inreg <48 x i16> @bitcast_v24i32_to_v48i16_scalar(<24 x i32> inreg %a, i3
 ; SI-NEXT:    v_mov_b32_e32 v21, s7
 ; SI-NEXT:    v_mov_b32_e32 v22, s4
 ; SI-NEXT:    v_mov_b32_e32 v23, s5
-; SI-NEXT:    v_readlane_b32 s35, v24, 3
-; SI-NEXT:    v_readlane_b32 s34, v24, 2
-; SI-NEXT:    v_readlane_b32 s31, v24, 1
-; SI-NEXT:    v_readlane_b32 s30, v24, 0
+; SI-NEXT:    v_readlane_b32 s31, v24, 3
+; SI-NEXT:    v_readlane_b32 s35, v24, 1
+; SI-NEXT:    v_readlane_b32 s34, v24, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -4554,7 +4554,7 @@ end:
   ret <48 x i16> %phi
 }
 
-define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) {
+define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v48i16_to_v24i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5601,7 +5601,7 @@ end:
   ret <24 x i32> %phi
 }
 
-define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i32 inreg %b) {
+define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v48i16_to_v24i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5609,29 +5609,29 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3
 ; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v24, s30, 0
-; SI-NEXT:    v_writelane_b32 v24, s31, 1
-; SI-NEXT:    v_writelane_b32 v24, s34, 2
-; SI-NEXT:    v_writelane_b32 v24, s35, 3
-; SI-NEXT:    v_writelane_b32 v24, s36, 4
-; SI-NEXT:    v_writelane_b32 v24, s37, 5
-; SI-NEXT:    v_writelane_b32 v24, s38, 6
-; SI-NEXT:    v_writelane_b32 v24, s39, 7
-; SI-NEXT:    v_writelane_b32 v24, s48, 8
-; SI-NEXT:    v_writelane_b32 v24, s49, 9
-; SI-NEXT:    v_writelane_b32 v24, s50, 10
-; SI-NEXT:    v_writelane_b32 v24, s51, 11
-; SI-NEXT:    v_writelane_b32 v24, s52, 12
-; SI-NEXT:    v_writelane_b32 v24, s53, 13
-; SI-NEXT:    v_writelane_b32 v24, s54, 14
-; SI-NEXT:    v_writelane_b32 v24, s55, 15
-; SI-NEXT:    v_writelane_b32 v24, s64, 16
-; SI-NEXT:    v_writelane_b32 v24, s65, 17
-; SI-NEXT:    v_writelane_b32 v24, s66, 18
-; SI-NEXT:    v_writelane_b32 v24, s67, 19
-; SI-NEXT:    v_writelane_b32 v24, s68, 20
-; SI-NEXT:    v_writelane_b32 v24, s69, 21
-; SI-NEXT:    v_writelane_b32 v24, s70, 22
+; SI-NEXT:    v_writelane_b32 v24, s34, 0
+; SI-NEXT:    v_writelane_b32 v24, s35, 1
+; SI-NEXT:    v_writelane_b32 v24, s36, 2
+; SI-NEXT:    v_writelane_b32 v24, s37, 3
+; SI-NEXT:    v_writelane_b32 v24, s38, 4
+; SI-NEXT:    v_writelane_b32 v24, s39, 5
+; SI-NEXT:    v_writelane_b32 v24, s48, 6
+; SI-NEXT:    v_writelane_b32 v24, s49, 7
+; SI-NEXT:    v_writelane_b32 v24, s50, 8
+; SI-NEXT:    v_writelane_b32 v24, s51, 9
+; SI-NEXT:    v_writelane_b32 v24, s52, 10
+; SI-NEXT:    v_writelane_b32 v24, s53, 11
+; SI-NEXT:    v_writelane_b32 v24, s54, 12
+; SI-NEXT:    v_writelane_b32 v24, s55, 13
+; SI-NEXT:    v_writelane_b32 v24, s64, 14
+; SI-NEXT:    v_writelane_b32 v24, s65, 15
+; SI-NEXT:    v_writelane_b32 v24, s66, 16
+; SI-NEXT:    v_writelane_b32 v24, s67, 17
+; SI-NEXT:    v_writelane_b32 v24, s68, 18
+; SI-NEXT:    v_writelane_b32 v24, s69, 19
+; SI-NEXT:    v_writelane_b32 v24, s70, 20
+; SI-NEXT:    v_writelane_b32 v24, s71, 21
+; SI-NEXT:    v_writelane_b32 v24, s30, 22
 ; SI-NEXT:    v_readfirstlane_b32 s7, v9
 ; SI-NEXT:    v_readfirstlane_b32 s9, v8
 ; SI-NEXT:    v_readfirstlane_b32 s11, v7
@@ -5642,7 +5642,7 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3
 ; SI-NEXT:    v_readfirstlane_b32 s88, v2
 ; SI-NEXT:    v_readfirstlane_b32 s91, v1
 ; SI-NEXT:    v_readfirstlane_b32 s94, v0
-; SI-NEXT:    v_writelane_b32 v24, s71, 23
+; SI-NEXT:    v_writelane_b32 v24, s31, 23
 ; SI-NEXT:    s_lshr_b32 s72, s29, 16
 ; SI-NEXT:    s_lshr_b32 s75, s28, 16
 ; SI-NEXT:    s_lshr_b32 s78, s27, 16
@@ -5866,6 +5866,7 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_add_i32 s59, s4, 0x30000
 ; SI-NEXT:  .LBB15_3: ; %end
+; SI-NEXT:    v_readlane_b32 s30, v24, 22
 ; SI-NEXT:    v_mov_b32_e32 v0, s36
 ; SI-NEXT:    v_mov_b32_e32 v1, s37
 ; SI-NEXT:    v_mov_b32_e32 v2, s38
@@ -5890,30 +5891,29 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3
 ; SI-NEXT:    v_mov_b32_e32 v21, s57
 ; SI-NEXT:    v_mov_b32_e32 v22, s58
 ; SI-NEXT:    v_mov_b32_e32 v23, s59
-; SI-NEXT:    v_readlane_b32 s71, v24, 23
-; SI-NEXT:    v_readlane_b32 s70, v24, 22
-; SI-NEXT:    v_readlane_b32 s69, v24, 21
-; SI-NEXT:    v_readlane_b32 s68, v24, 20
-; SI-NEXT:    v_readlane_b32 s67, v24, 19
-; SI-NEXT:    v_readlane_b32 s66, v24, 18
-; SI-NEXT:    v_readlane_b32 s65, v24, 17
-; SI-NEXT:    v_readlane_b32 s64, v24, 16
-; SI-NEXT:    v_readlane_b32 s55, v24, 15
-; SI-NEXT:    v_readlane_b32 s54, v24, 14
-; SI-NEXT:    v_readlane_b32 s53, v24, 13
-; SI-NEXT:    v_readlane_b32 s52, v24, 12
-; SI-NEXT:    v_readlane_b32 s51, v24, 11
-; SI-NEXT:    v_readlane_b32 s50, v24, 10
-; SI-NEXT:    v_readlane_b32 s49, v24, 9
-; SI-NEXT:    v_readlane_b32 s48, v24, 8
-; SI-NEXT:    v_readlane_b32 s39, v24, 7
-; SI-NEXT:    v_readlane_b32 s38, v24, 6
-; SI-NEXT:    v_readlane_b32 s37, v24, 5
-; SI-NEXT:    v_readlane_b32 s36, v24, 4
-; SI-NEXT:    v_readlane_b32 s35, v24, 3
-; SI-NEXT:    v_readlane_b32 s34, v24, 2
-; SI-NEXT:    v_readlane_b32 s31, v24, 1
-; SI-NEXT:    v_readlane_b32 s30, v24, 0
+; SI-NEXT:    v_readlane_b32 s31, v24, 23
+; SI-NEXT:    v_readlane_b32 s71, v24, 21
+; SI-NEXT:    v_readlane_b32 s70, v24, 20
+; SI-NEXT:    v_readlane_b32 s69, v24, 19
+; SI-NEXT:    v_readlane_b32 s68, v24, 18
+; SI-NEXT:    v_readlane_b32 s67, v24, 17
+; SI-NEXT:    v_readlane_b32 s66, v24, 16
+; SI-NEXT:    v_readlane_b32 s65, v24, 15
+; SI-NEXT:    v_readlane_b32 s64, v24, 14
+; SI-NEXT:    v_readlane_b32 s55, v24, 13
+; SI-NEXT:    v_readlane_b32 s54, v24, 12
+; SI-NEXT:    v_readlane_b32 s53, v24, 11
+; SI-NEXT:    v_readlane_b32 s52, v24, 10
+; SI-NEXT:    v_readlane_b32 s51, v24, 9
+; SI-NEXT:    v_readlane_b32 s50, v24, 8
+; SI-NEXT:    v_readlane_b32 s49, v24, 7
+; SI-NEXT:    v_readlane_b32 s48, v24, 6
+; SI-NEXT:    v_readlane_b32 s39, v24, 5
+; SI-NEXT:    v_readlane_b32 s38, v24, 4
+; SI-NEXT:    v_readlane_b32 s37, v24, 3
+; SI-NEXT:    v_readlane_b32 s36, v24, 2
+; SI-NEXT:    v_readlane_b32 s35, v24, 1
+; SI-NEXT:    v_readlane_b32 s34, v24, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -5929,33 +5929,33 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v24, s30, 0
-; VI-NEXT:    v_writelane_b32 v24, s31, 1
-; VI-NEXT:    v_writelane_b32 v24, s34, 2
-; VI-NEXT:    v_writelane_b32 v24, s35, 3
-; VI-NEXT:    v_writelane_b32 v24, s36, 4
-; VI-NEXT:    v_writelane_b32 v24, s37, 5
-; VI-NEXT:    v_writelane_b32 v24, s38, 6
-; VI-NEXT:    v_writelane_b32 v24, s39, 7
-; VI-NEXT:    v_writelane_b32 v24, s48, 8
-; VI-NEXT:    v_writelane_b32 v24, s49, 9
-; VI-NEXT:    v_writelane_b32 v24, s50, 10
-; VI-NEXT:    v_writelane_b32 v24, s51, 11
-; VI-NEXT:    v_writelane_b32 v24, s52, 12
-; VI-NEXT:    v_writelane_b32 v24, s53, 13
-; VI-NEXT:    v_writelane_b32 v24, s54, 14
-; VI-NEXT:    v_writelane_b32 v24, s55, 15
-; VI-NEXT:    v_writelane_b32 v24, s64, 16
-; VI-NEXT:    v_writelane_b32 v24, s65, 17
-; VI-NEXT:    v_writelane_b32 v24, s66, 18
-; VI-NEXT:    v_writelane_b32 v24, s67, 19
-; VI-NEXT:    v_writelane_b32 v24, s68, 20
-; VI-NEXT:    v_writelane_b32 v24, s69, 21
-; VI-NEXT:    v_writelane_b32 v24, s70, 22
-; VI-NEXT:    v_writelane_b32 v24, s71, 23
-; VI-NEXT:    v_writelane_b32 v24, s80, 24
-; VI-NEXT:    v_writelane_b32 v24, s81, 25
-; VI-NEXT:    v_writelane_b32 v24, s82, 26
+; VI-NEXT:    v_writelane_b32 v24, s34, 0
+; VI-NEXT:    v_writelane_b32 v24, s35, 1
+; VI-NEXT:    v_writelane_b32 v24, s36, 2
+; VI-NEXT:    v_writelane_b32 v24, s37, 3
+; VI-NEXT:    v_writelane_b32 v24, s38, 4
+; VI-NEXT:    v_writelane_b32 v24, s39, 5
+; VI-NEXT:    v_writelane_b32 v24, s48, 6
+; VI-NEXT:    v_writelane_b32 v24, s49, 7
+; VI-NEXT:    v_writelane_b32 v24, s50, 8
+; VI-NEXT:    v_writelane_b32 v24, s51, 9
+; VI-NEXT:    v_writelane_b32 v24, s52, 10
+; VI-NEXT:    v_writelane_b32 v24, s53, 11
+; VI-NEXT:    v_writelane_b32 v24, s54, 12
+; VI-NEXT:    v_writelane_b32 v24, s55, 13
+; VI-NEXT:    v_writelane_b32 v24, s64, 14
+; VI-NEXT:    v_writelane_b32 v24, s65, 15
+; VI-NEXT:    v_writelane_b32 v24, s66, 16
+; VI-NEXT:    v_writelane_b32 v24, s67, 17
+; VI-NEXT:    v_writelane_b32 v24, s68, 18
+; VI-NEXT:    v_writelane_b32 v24, s69, 19
+; VI-NEXT:    v_writelane_b32 v24, s70, 20
+; VI-NEXT:    v_writelane_b32 v24, s71, 21
+; VI-NEXT:    v_writelane_b32 v24, s80, 22
+; VI-NEXT:    v_writelane_b32 v24, s81, 23
+; VI-NEXT:    v_writelane_b32 v24, s82, 24
+; VI-NEXT:    v_writelane_b32 v24, s83, 25
+; VI-NEXT:    v_writelane_b32 v24, s30, 26
 ; VI-NEXT:    v_readfirstlane_b32 s7, v9
 ; VI-NEXT:    v_readfirstlane_b32 s9, v8
 ; VI-NEXT:    v_readfirstlane_b32 s11, v7
@@ -5966,7 +5966,7 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3
 ; VI-NEXT:    v_readfirstlane_b32 s88, v2
 ; VI-NEXT:    v_readfirstlane_b32 s91, v1
 ; VI-NEXT:    v_readfirstlane_b32 s34, v0
-; VI-NEXT:    v_writelane_b32 v24, s83, 27
+; VI-NEXT:    v_writelane_b32 v24, s31, 27
 ; VI-NEXT:    s_lshr_b32 s72, s29, 16
 ; VI-NEXT:    s_lshr_b32 s75, s28, 16
 ; VI-NEXT:    s_lshr_b32 s78, s27, 16
@@ -6190,6 +6190,7 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3
 ; VI-NEXT:    s_or_b32 s4, s5, s4
 ; VI-NEXT:    s_add_i32 s59, s4, 0x30000
 ; VI-NEXT:  .LBB15_3: ; %end
+; VI-NEXT:    v_readlane_b32 s30, v24, 26
 ; VI-NEXT:    v_mov_b32_e32 v0, s36
 ; VI-NEXT:    v_mov_b32_e32 v1, s37
 ; VI-NEXT:    v_mov_b32_e32 v2, s38
@@ -6214,34 +6215,33 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3
 ; VI-NEXT:    v_mov_b32_e32 v21, s57
 ; VI-NEXT:    v_mov_b32_e32 v22, s58
 ; VI-NEXT:    v_mov_b32_e32 v23, s59
-; VI-NEXT:    v_readlane_b32 s83, v24, 27
-; VI-NEXT:    v_readlane_b32 s82, v24, 26
-; VI-NEXT:    v_readlane_b32 s81, v24, 25
-; VI-NEXT:    v_readlane_b32 s80, v24, 24
-; VI-NEXT:    v_readlane_b32 s71, v24, 23
-; VI-NEXT:    v_readlane_b32 s70, v24, 22
-; VI-NEXT:    v_readlane_b32 s69, v24, 21
-; VI-NEXT:    v_readlane_b32 s68, v24, 20
-; VI-NEXT:    v_readlane_b32 s67, v24, 19
-; VI-NEXT:    v_readlane_b32 s66, v24, 18
-; VI-NEXT:    v_readlane_b32 s65, v24, 17
-; VI-NEXT:    v_readlane_b32 s64, v24, 16
-; VI-NEXT:    v_readlane_b32 s55, v24, 15
-; VI-NEXT:    v_readlane_b32 s54, v24, 14
-; VI-NEXT:    v_readlane_b32 s53, v24, 13
-; VI-NEXT:    v_readlane_b32 s52, v24, 12
-; VI-NEXT:    v_readlane_b32 s51, v24, 11
-; VI-NEXT:    v_readlane_b32 s50, v24, 10
-; VI-NEXT:    v_readlane_b32 s49, v24, 9
-; VI-NEXT:    v_readlane_b32 s48, v24, 8
-; VI-NEXT:    v_readlane_b32 s39, v24, 7
-; VI-NEXT:    v_readlane_b32 s38, v24, 6
-; VI-NEXT:    v_readlane_b32 s37, v24, 5
-; VI-NEXT:    v_readlane_b32 s36, v24, 4
-; VI-NEXT:    v_readlane_b32 s35, v24, 3
-; VI-NEXT:    v_readlane_b32 s34, v24, 2
-; VI-NEXT:    v_readlane_b32 s31, v24, 1
-; VI-NEXT:    v_readlane_b32 s30, v24, 0
+; VI-NEXT:    v_readlane_b32 s31, v24, 27
+; VI-NEXT:    v_readlane_b32 s83, v24, 25
+; VI-NEXT:    v_readlane_b32 s82, v24, 24
+; VI-NEXT:    v_readlane_b32 s81, v24, 23
+; VI-NEXT:    v_readlane_b32 s80, v24, 22
+; VI-NEXT:    v_readlane_b32 s71, v24, 21
+; VI-NEXT:    v_readlane_b32 s70, v24, 20
+; VI-NEXT:    v_readlane_b32 s69, v24, 19
+; VI-NEXT:    v_readlane_b32 s68, v24, 18
+; VI-NEXT:    v_readlane_b32 s67, v24, 17
+; VI-NEXT:    v_readlane_b32 s66, v24, 16
+; VI-NEXT:    v_readlane_b32 s65, v24, 15
+; VI-NEXT:    v_readlane_b32 s64, v24, 14
+; VI-NEXT:    v_readlane_b32 s55, v24, 13
+; VI-NEXT:    v_readlane_b32 s54, v24, 12
+; VI-NEXT:    v_readlane_b32 s53, v24, 11
+; VI-NEXT:    v_readlane_b32 s52, v24, 10
+; VI-NEXT:    v_readlane_b32 s51, v24, 9
+; VI-NEXT:    v_readlane_b32 s50, v24, 8
+; VI-NEXT:    v_readlane_b32 s49, v24, 7
+; VI-NEXT:    v_readlane_b32 s48, v24, 6
+; VI-NEXT:    v_readlane_b32 s39, v24, 5
+; VI-NEXT:    v_readlane_b32 s38, v24, 4
+; VI-NEXT:    v_readlane_b32 s37, v24, 3
+; VI-NEXT:    v_readlane_b32 s36, v24, 2
+; VI-NEXT:    v_readlane_b32 s35, v24, 1
+; VI-NEXT:    v_readlane_b32 s34, v24, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -6539,7 +6539,7 @@ end:
   ret <24 x i32> %phi
 }
 
-define <48 x half> @bitcast_v24i32_to_v48f16(<24 x i32> %a, i32 %b) {
+define <48 x half> @bitcast_v24i32_to_v48f16(<24 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v24i32_to_v48f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7213,7 +7213,7 @@ end:
   ret <48 x half> %phi
 }
 
-define inreg <48 x half> @bitcast_v24i32_to_v48f16_scalar(<24 x i32> inreg %a, i32 inreg %b) {
+define inreg <48 x half> @bitcast_v24i32_to_v48f16_scalar(<24 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v24i32_to_v48f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7221,10 +7221,10 @@ define inreg <48 x half> @bitcast_v24i32_to_v48f16_scalar(<24 x i32> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v24, s30, 0
-; SI-NEXT:    v_writelane_b32 v24, s31, 1
+; SI-NEXT:    v_writelane_b32 v24, s34, 0
+; SI-NEXT:    v_writelane_b32 v24, s35, 1
 ; SI-NEXT:    v_readfirstlane_b32 s12, v10
-; SI-NEXT:    v_writelane_b32 v24, s34, 2
+; SI-NEXT:    v_writelane_b32 v24, s30, 2
 ; SI-NEXT:    v_readfirstlane_b32 s5, v9
 ; SI-NEXT:    v_readfirstlane_b32 s4, v8
 ; SI-NEXT:    v_readfirstlane_b32 s7, v7
@@ -7236,7 +7236,7 @@ define inreg <48 x half> @bitcast_v24i32_to_v48f16_scalar(<24 x i32> inreg %a, i
 ; SI-NEXT:    v_readfirstlane_b32 s13, v1
 ; SI-NEXT:    s_cmp_lg_u32 s12, 0
 ; SI-NEXT:    v_readfirstlane_b32 s12, v0
-; SI-NEXT:    v_writelane_b32 v24, s35, 3
+; SI-NEXT:    v_writelane_b32 v24, s31, 3
 ; SI-NEXT:    s_cbranch_scc0 .LBB17_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s88, s5, 16
@@ -7386,6 +7386,7 @@ define inreg <48 x half> @bitcast_v24i32_to_v48f16_scalar(<24 x i32> inreg %a, i
 ; SI-NEXT:    s_lshl_b32 s14, s88, 16
 ; SI-NEXT:    s_or_b32 s7, s7, s29
 ; SI-NEXT:    s_or_b32 s5, s5, s14
+; SI-NEXT:    v_readlane_b32 s30, v24, 2
 ; SI-NEXT:    v_mov_b32_e32 v0, s15
 ; SI-NEXT:    v_mov_b32_e32 v1, s16
 ; SI-NEXT:    v_mov_b32_e32 v2, s17
@@ -7410,10 +7411,9 @@ define inreg <48 x half> @bitcast_v24i32_to_v48f16_scalar(<24 x i32> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v21, s7
 ; SI-NEXT:    v_mov_b32_e32 v22, s4
 ; SI-NEXT:    v_mov_b32_e32 v23, s5
-; SI-NEXT:    v_readlane_b32 s35, v24, 3
-; SI-NEXT:    v_readlane_b32 s34, v24, 2
-; SI-NEXT:    v_readlane_b32 s31, v24, 1
-; SI-NEXT:    v_readlane_b32 s30, v24, 0
+; SI-NEXT:    v_readlane_b32 s31, v24, 3
+; SI-NEXT:    v_readlane_b32 s35, v24, 1
+; SI-NEXT:    v_readlane_b32 s34, v24, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -8001,7 +8001,7 @@ end:
   ret <48 x half> %phi
 }
 
-define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) {
+define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v48f16_to_v24i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9160,7 +9160,7 @@ end:
   ret <24 x i32> %phi
 }
 
-define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i32 inreg %b) {
+define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v48f16_to_v24i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9168,29 +9168,29 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v32, s30, 0
-; SI-NEXT:    v_writelane_b32 v32, s31, 1
-; SI-NEXT:    v_writelane_b32 v32, s34, 2
-; SI-NEXT:    v_writelane_b32 v32, s35, 3
-; SI-NEXT:    v_writelane_b32 v32, s36, 4
-; SI-NEXT:    v_writelane_b32 v32, s37, 5
-; SI-NEXT:    v_writelane_b32 v32, s38, 6
-; SI-NEXT:    v_writelane_b32 v32, s39, 7
-; SI-NEXT:    v_writelane_b32 v32, s48, 8
-; SI-NEXT:    v_writelane_b32 v32, s49, 9
-; SI-NEXT:    v_writelane_b32 v32, s50, 10
-; SI-NEXT:    v_writelane_b32 v32, s51, 11
-; SI-NEXT:    v_writelane_b32 v32, s52, 12
-; SI-NEXT:    v_writelane_b32 v32, s53, 13
-; SI-NEXT:    v_writelane_b32 v32, s54, 14
-; SI-NEXT:    v_writelane_b32 v32, s55, 15
-; SI-NEXT:    v_writelane_b32 v32, s64, 16
-; SI-NEXT:    v_writelane_b32 v32, s65, 17
-; SI-NEXT:    v_writelane_b32 v32, s66, 18
-; SI-NEXT:    v_writelane_b32 v32, s67, 19
-; SI-NEXT:    v_writelane_b32 v32, s68, 20
-; SI-NEXT:    v_writelane_b32 v32, s69, 21
-; SI-NEXT:    v_writelane_b32 v32, s70, 22
+; SI-NEXT:    v_writelane_b32 v32, s34, 0
+; SI-NEXT:    v_writelane_b32 v32, s35, 1
+; SI-NEXT:    v_writelane_b32 v32, s36, 2
+; SI-NEXT:    v_writelane_b32 v32, s37, 3
+; SI-NEXT:    v_writelane_b32 v32, s38, 4
+; SI-NEXT:    v_writelane_b32 v32, s39, 5
+; SI-NEXT:    v_writelane_b32 v32, s48, 6
+; SI-NEXT:    v_writelane_b32 v32, s49, 7
+; SI-NEXT:    v_writelane_b32 v32, s50, 8
+; SI-NEXT:    v_writelane_b32 v32, s51, 9
+; SI-NEXT:    v_writelane_b32 v32, s52, 10
+; SI-NEXT:    v_writelane_b32 v32, s53, 11
+; SI-NEXT:    v_writelane_b32 v32, s54, 12
+; SI-NEXT:    v_writelane_b32 v32, s55, 13
+; SI-NEXT:    v_writelane_b32 v32, s64, 14
+; SI-NEXT:    v_writelane_b32 v32, s65, 15
+; SI-NEXT:    v_writelane_b32 v32, s66, 16
+; SI-NEXT:    v_writelane_b32 v32, s67, 17
+; SI-NEXT:    v_writelane_b32 v32, s68, 18
+; SI-NEXT:    v_writelane_b32 v32, s69, 19
+; SI-NEXT:    v_writelane_b32 v32, s70, 20
+; SI-NEXT:    v_writelane_b32 v32, s71, 21
+; SI-NEXT:    v_writelane_b32 v32, s30, 22
 ; SI-NEXT:    v_readfirstlane_b32 s6, v9
 ; SI-NEXT:    v_readfirstlane_b32 s8, v8
 ; SI-NEXT:    v_readfirstlane_b32 s10, v7
@@ -9201,7 +9201,7 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i
 ; SI-NEXT:    v_readfirstlane_b32 s76, v2
 ; SI-NEXT:    v_readfirstlane_b32 s79, v1
 ; SI-NEXT:    v_readfirstlane_b32 s89, v0
-; SI-NEXT:    v_writelane_b32 v32, s71, 23
+; SI-NEXT:    v_writelane_b32 v32, s31, 23
 ; SI-NEXT:    s_lshr_b32 s78, s29, 16
 ; SI-NEXT:    s_lshr_b32 s90, s28, 16
 ; SI-NEXT:    s_lshr_b32 s92, s27, 16
@@ -9534,30 +9534,30 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v30, s66
 ; SI-NEXT:    v_mov_b32_e32 v31, s67
 ; SI-NEXT:  .LBB19_5: ; %end
-; SI-NEXT:    v_readlane_b32 s71, v32, 23
-; SI-NEXT:    v_readlane_b32 s70, v32, 22
-; SI-NEXT:    v_readlane_b32 s69, v32, 21
-; SI-NEXT:    v_readlane_b32 s68, v32, 20
-; SI-NEXT:    v_readlane_b32 s67, v32, 19
-; SI-NEXT:    v_readlane_b32 s66, v32, 18
-; SI-NEXT:    v_readlane_b32 s65, v32, 17
-; SI-NEXT:    v_readlane_b32 s64, v32, 16
-; SI-NEXT:    v_readlane_b32 s55, v32, 15
-; SI-NEXT:    v_readlane_b32 s54, v32, 14
-; SI-NEXT:    v_readlane_b32 s53, v32, 13
-; SI-NEXT:    v_readlane_b32 s52, v32, 12
-; SI-NEXT:    v_readlane_b32 s51, v32, 11
-; SI-NEXT:    v_readlane_b32 s50, v32, 10
-; SI-NEXT:    v_readlane_b32 s49, v32, 9
-; SI-NEXT:    v_readlane_b32 s48, v32, 8
-; SI-NEXT:    v_readlane_b32 s39, v32, 7
-; SI-NEXT:    v_readlane_b32 s38, v32, 6
-; SI-NEXT:    v_readlane_b32 s37, v32, 5
-; SI-NEXT:    v_readlane_b32 s36, v32, 4
-; SI-NEXT:    v_readlane_b32 s35, v32, 3
-; SI-NEXT:    v_readlane_b32 s34, v32, 2
-; SI-NEXT:    v_readlane_b32 s31, v32, 1
-; SI-NEXT:    v_readlane_b32 s30, v32, 0
+; SI-NEXT:    v_readlane_b32 s30, v32, 22
+; SI-NEXT:    v_readlane_b32 s31, v32, 23
+; SI-NEXT:    v_readlane_b32 s71, v32, 21
+; SI-NEXT:    v_readlane_b32 s70, v32, 20
+; SI-NEXT:    v_readlane_b32 s69, v32, 19
+; SI-NEXT:    v_readlane_b32 s68, v32, 18
+; SI-NEXT:    v_readlane_b32 s67, v32, 17
+; SI-NEXT:    v_readlane_b32 s66, v32, 16
+; SI-NEXT:    v_readlane_b32 s65, v32, 15
+; SI-NEXT:    v_readlane_b32 s64, v32, 14
+; SI-NEXT:    v_readlane_b32 s55, v32, 13
+; SI-NEXT:    v_readlane_b32 s54, v32, 12
+; SI-NEXT:    v_readlane_b32 s53, v32, 11
+; SI-NEXT:    v_readlane_b32 s52, v32, 10
+; SI-NEXT:    v_readlane_b32 s51, v32, 9
+; SI-NEXT:    v_readlane_b32 s50, v32, 8
+; SI-NEXT:    v_readlane_b32 s49, v32, 7
+; SI-NEXT:    v_readlane_b32 s48, v32, 6
+; SI-NEXT:    v_readlane_b32 s39, v32, 5
+; SI-NEXT:    v_readlane_b32 s38, v32, 4
+; SI-NEXT:    v_readlane_b32 s37, v32, 3
+; SI-NEXT:    v_readlane_b32 s36, v32, 2
+; SI-NEXT:    v_readlane_b32 s35, v32, 1
+; SI-NEXT:    v_readlane_b32 s34, v32, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -9570,33 +9570,33 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v32, s30, 0
-; VI-NEXT:    v_writelane_b32 v32, s31, 1
-; VI-NEXT:    v_writelane_b32 v32, s34, 2
-; VI-NEXT:    v_writelane_b32 v32, s35, 3
-; VI-NEXT:    v_writelane_b32 v32, s36, 4
-; VI-NEXT:    v_writelane_b32 v32, s37, 5
-; VI-NEXT:    v_writelane_b32 v32, s38, 6
-; VI-NEXT:    v_writelane_b32 v32, s39, 7
-; VI-NEXT:    v_writelane_b32 v32, s48, 8
-; VI-NEXT:    v_writelane_b32 v32, s49, 9
-; VI-NEXT:    v_writelane_b32 v32, s50, 10
-; VI-NEXT:    v_writelane_b32 v32, s51, 11
-; VI-NEXT:    v_writelane_b32 v32, s52, 12
-; VI-NEXT:    v_writelane_b32 v32, s53, 13
-; VI-NEXT:    v_writelane_b32 v32, s54, 14
-; VI-NEXT:    v_writelane_b32 v32, s55, 15
-; VI-NEXT:    v_writelane_b32 v32, s64, 16
-; VI-NEXT:    v_writelane_b32 v32, s65, 17
-; VI-NEXT:    v_writelane_b32 v32, s66, 18
-; VI-NEXT:    v_writelane_b32 v32, s67, 19
-; VI-NEXT:    v_writelane_b32 v32, s68, 20
-; VI-NEXT:    v_writelane_b32 v32, s69, 21
-; VI-NEXT:    v_writelane_b32 v32, s70, 22
-; VI-NEXT:    v_writelane_b32 v32, s71, 23
-; VI-NEXT:    v_writelane_b32 v32, s80, 24
-; VI-NEXT:    v_writelane_b32 v32, s81, 25
-; VI-NEXT:    v_writelane_b32 v32, s82, 26
+; VI-NEXT:    v_writelane_b32 v32, s34, 0
+; VI-NEXT:    v_writelane_b32 v32, s35, 1
+; VI-NEXT:    v_writelane_b32 v32, s36, 2
+; VI-NEXT:    v_writelane_b32 v32, s37, 3
+; VI-NEXT:    v_writelane_b32 v32, s38, 4
+; VI-NEXT:    v_writelane_b32 v32, s39, 5
+; VI-NEXT:    v_writelane_b32 v32, s48, 6
+; VI-NEXT:    v_writelane_b32 v32, s49, 7
+; VI-NEXT:    v_writelane_b32 v32, s50, 8
+; VI-NEXT:    v_writelane_b32 v32, s51, 9
+; VI-NEXT:    v_writelane_b32 v32, s52, 10
+; VI-NEXT:    v_writelane_b32 v32, s53, 11
+; VI-NEXT:    v_writelane_b32 v32, s54, 12
+; VI-NEXT:    v_writelane_b32 v32, s55, 13
+; VI-NEXT:    v_writelane_b32 v32, s64, 14
+; VI-NEXT:    v_writelane_b32 v32, s65, 15
+; VI-NEXT:    v_writelane_b32 v32, s66, 16
+; VI-NEXT:    v_writelane_b32 v32, s67, 17
+; VI-NEXT:    v_writelane_b32 v32, s68, 18
+; VI-NEXT:    v_writelane_b32 v32, s69, 19
+; VI-NEXT:    v_writelane_b32 v32, s70, 20
+; VI-NEXT:    v_writelane_b32 v32, s71, 21
+; VI-NEXT:    v_writelane_b32 v32, s80, 22
+; VI-NEXT:    v_writelane_b32 v32, s81, 23
+; VI-NEXT:    v_writelane_b32 v32, s82, 24
+; VI-NEXT:    v_writelane_b32 v32, s83, 25
+; VI-NEXT:    v_writelane_b32 v32, s30, 26
 ; VI-NEXT:    v_readfirstlane_b32 s6, v9
 ; VI-NEXT:    v_readfirstlane_b32 s8, v8
 ; VI-NEXT:    v_readfirstlane_b32 s10, v7
@@ -9607,7 +9607,7 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i
 ; VI-NEXT:    v_readfirstlane_b32 s79, v2
 ; VI-NEXT:    v_readfirstlane_b32 s91, v1
 ; VI-NEXT:    v_readfirstlane_b32 s34, v0
-; VI-NEXT:    v_writelane_b32 v32, s83, 27
+; VI-NEXT:    v_writelane_b32 v32, s31, 27
 ; VI-NEXT:    s_lshr_b32 s72, s29, 16
 ; VI-NEXT:    s_lshr_b32 s74, s28, 16
 ; VI-NEXT:    s_lshr_b32 s77, s27, 16
@@ -9845,34 +9845,34 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i
 ; VI-NEXT:    v_mov_b32_e32 v30, s66
 ; VI-NEXT:    v_mov_b32_e32 v31, s67
 ; VI-NEXT:  .LBB19_5: ; %end
-; VI-NEXT:    v_readlane_b32 s83, v32, 27
-; VI-NEXT:    v_readlane_b32 s82, v32, 26
-; VI-NEXT:    v_readlane_b32 s81, v32, 25
-; VI-NEXT:    v_readlane_b32 s80, v32, 24
-; VI-NEXT:    v_readlane_b32 s71, v32, 23
-; VI-NEXT:    v_readlane_b32 s70, v32, 22
-; VI-NEXT:    v_readlane_b32 s69, v32, 21
-; VI-NEXT:    v_readlane_b32 s68, v32, 20
-; VI-NEXT:    v_readlane_b32 s67, v32, 19
-; VI-NEXT:    v_readlane_b32 s66, v32, 18
-; VI-NEXT:    v_readlane_b32 s65, v32, 17
-; VI-NEXT:    v_readlane_b32 s64, v32, 16
-; VI-NEXT:    v_readlane_b32 s55, v32, 15
-; VI-NEXT:    v_readlane_b32 s54, v32, 14
-; VI-NEXT:    v_readlane_b32 s53, v32, 13
-; VI-NEXT:    v_readlane_b32 s52, v32, 12
-; VI-NEXT:    v_readlane_b32 s51, v32, 11
-; VI-NEXT:    v_readlane_b32 s50, v32, 10
-; VI-NEXT:    v_readlane_b32 s49, v32, 9
-; VI-NEXT:    v_readlane_b32 s48, v32, 8
-; VI-NEXT:    v_readlane_b32 s39, v32, 7
-; VI-NEXT:    v_readlane_b32 s38, v32, 6
-; VI-NEXT:    v_readlane_b32 s37, v32, 5
-; VI-NEXT:    v_readlane_b32 s36, v32, 4
-; VI-NEXT:    v_readlane_b32 s35, v32, 3
-; VI-NEXT:    v_readlane_b32 s34, v32, 2
-; VI-NEXT:    v_readlane_b32 s31, v32, 1
-; VI-NEXT:    v_readlane_b32 s30, v32, 0
+; VI-NEXT:    v_readlane_b32 s30, v32, 26
+; VI-NEXT:    v_readlane_b32 s31, v32, 27
+; VI-NEXT:    v_readlane_b32 s83, v32, 25
+; VI-NEXT:    v_readlane_b32 s82, v32, 24
+; VI-NEXT:    v_readlane_b32 s81, v32, 23
+; VI-NEXT:    v_readlane_b32 s80, v32, 22
+; VI-NEXT:    v_readlane_b32 s71, v32, 21
+; VI-NEXT:    v_readlane_b32 s70, v32, 20
+; VI-NEXT:    v_readlane_b32 s69, v32, 19
+; VI-NEXT:    v_readlane_b32 s68, v32, 18
+; VI-NEXT:    v_readlane_b32 s67, v32, 17
+; VI-NEXT:    v_readlane_b32 s66, v32, 16
+; VI-NEXT:    v_readlane_b32 s65, v32, 15
+; VI-NEXT:    v_readlane_b32 s64, v32, 14
+; VI-NEXT:    v_readlane_b32 s55, v32, 13
+; VI-NEXT:    v_readlane_b32 s54, v32, 12
+; VI-NEXT:    v_readlane_b32 s53, v32, 11
+; VI-NEXT:    v_readlane_b32 s52, v32, 10
+; VI-NEXT:    v_readlane_b32 s51, v32, 9
+; VI-NEXT:    v_readlane_b32 s50, v32, 8
+; VI-NEXT:    v_readlane_b32 s49, v32, 7
+; VI-NEXT:    v_readlane_b32 s48, v32, 6
+; VI-NEXT:    v_readlane_b32 s39, v32, 5
+; VI-NEXT:    v_readlane_b32 s38, v32, 4
+; VI-NEXT:    v_readlane_b32 s37, v32, 3
+; VI-NEXT:    v_readlane_b32 s36, v32, 2
+; VI-NEXT:    v_readlane_b32 s35, v32, 1
+; VI-NEXT:    v_readlane_b32 s34, v32, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -10168,7 +10168,7 @@ end:
   ret <24 x i32> %phi
 }
 
-define <12 x i64> @bitcast_v24f32_to_v12i64(<24 x float> %a, i32 %b) {
+define <12 x i64> @bitcast_v24f32_to_v12i64(<24 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v24f32_to_v12i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10322,7 +10322,7 @@ end:
   ret <12 x i64> %phi
 }
 
-define inreg <12 x i64> @bitcast_v24f32_to_v12i64_scalar(<24 x float> inreg %a, i32 inreg %b) {
+define inreg <12 x i64> @bitcast_v24f32_to_v12i64_scalar(<24 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v24f32_to_v12i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10834,7 +10834,7 @@ end:
   ret <12 x i64> %phi
 }
 
-define <24 x float> @bitcast_v12i64_to_v24f32(<12 x i64> %a, i32 %b) {
+define <24 x float> @bitcast_v12i64_to_v24f32(<12 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v12i64_to_v24f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11006,7 +11006,7 @@ end:
   ret <24 x float> %phi
 }
 
-define inreg <24 x float> @bitcast_v12i64_to_v24f32_scalar(<12 x i64> inreg %a, i32 inreg %b) {
+define inreg <24 x float> @bitcast_v12i64_to_v24f32_scalar(<12 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v12i64_to_v24f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11298,7 +11298,7 @@ end:
   ret <24 x float> %phi
 }
 
-define <12 x double> @bitcast_v24f32_to_v12f64(<24 x float> %a, i32 %b) {
+define <12 x double> @bitcast_v24f32_to_v12f64(<24 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v24f32_to_v12f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11452,7 +11452,7 @@ end:
   ret <12 x double> %phi
 }
 
-define inreg <12 x double> @bitcast_v24f32_to_v12f64_scalar(<24 x float> inreg %a, i32 inreg %b) {
+define inreg <12 x double> @bitcast_v24f32_to_v12f64_scalar(<24 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v24f32_to_v12f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11964,7 +11964,7 @@ end:
   ret <12 x double> %phi
 }
 
-define <24 x float> @bitcast_v12f64_to_v24f32(<12 x double> %a, i32 %b) {
+define <24 x float> @bitcast_v12f64_to_v24f32(<12 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v12f64_to_v24f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12082,7 +12082,7 @@ end:
   ret <24 x float> %phi
 }
 
-define inreg <24 x float> @bitcast_v12f64_to_v24f32_scalar(<12 x double> inreg %a, i32 inreg %b) {
+define inreg <24 x float> @bitcast_v12f64_to_v24f32_scalar(<12 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v12f64_to_v24f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12546,7 +12546,7 @@ end:
   ret <24 x float> %phi
 }
 
-define <48 x i16> @bitcast_v24f32_to_v48i16(<24 x float> %a, i32 %b) {
+define <48 x i16> @bitcast_v24f32_to_v48i16(<24 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v24f32_to_v48i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13196,7 +13196,7 @@ end:
   ret <48 x i16> %phi
 }
 
-define inreg <48 x i16> @bitcast_v24f32_to_v48i16_scalar(<24 x float> inreg %a, i32 inreg %b) {
+define inreg <48 x i16> @bitcast_v24f32_to_v48i16_scalar(<24 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v24f32_to_v48i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13204,10 +13204,10 @@ define inreg <48 x i16> @bitcast_v24f32_to_v48i16_scalar(<24 x float> inreg %a,
 ; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v40, s30, 0
-; SI-NEXT:    v_writelane_b32 v40, s31, 1
+; SI-NEXT:    v_writelane_b32 v40, s34, 0
+; SI-NEXT:    v_writelane_b32 v40, s35, 1
 ; SI-NEXT:    v_readfirstlane_b32 s12, v10
-; SI-NEXT:    v_writelane_b32 v40, s34, 2
+; SI-NEXT:    v_writelane_b32 v40, s30, 2
 ; SI-NEXT:    v_readfirstlane_b32 s5, v9
 ; SI-NEXT:    v_readfirstlane_b32 s4, v8
 ; SI-NEXT:    v_readfirstlane_b32 s7, v7
@@ -13219,7 +13219,7 @@ define inreg <48 x i16> @bitcast_v24f32_to_v48i16_scalar(<24 x float> inreg %a,
 ; SI-NEXT:    v_readfirstlane_b32 s13, v1
 ; SI-NEXT:    s_cmp_lg_u32 s12, 0
 ; SI-NEXT:    v_readfirstlane_b32 s12, v0
-; SI-NEXT:    v_writelane_b32 v40, s35, 3
+; SI-NEXT:    v_writelane_b32 v40, s31, 3
 ; SI-NEXT:    s_cbranch_scc0 .LBB29_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s35, s5, 16
@@ -13433,6 +13433,7 @@ define inreg <48 x i16> @bitcast_v24f32_to_v48i16_scalar(<24 x float> inreg %a,
 ; SI-NEXT:    v_or_b32_e32 v22, v22, v24
 ; SI-NEXT:    v_and_b32_e32 v23, 0xffff, v23
 ; SI-NEXT:    v_lshlrev_b32_e32 v24, 16, v36
+; SI-NEXT:    v_readlane_b32 s30, v40, 2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v35
 ; SI-NEXT:    v_or_b32_e32 v3, v3, v34
 ; SI-NEXT:    v_or_b32_e32 v5, v5, v33
@@ -13445,10 +13446,9 @@ define inreg <48 x i16> @bitcast_v24f32_to_v48i16_scalar(<24 x float> inreg %a,
 ; SI-NEXT:    v_or_b32_e32 v19, v19, v26
 ; SI-NEXT:    v_or_b32_e32 v21, v21, v25
 ; SI-NEXT:    v_or_b32_e32 v23, v23, v24
-; SI-NEXT:    v_readlane_b32 s35, v40, 3
-; SI-NEXT:    v_readlane_b32 s34, v40, 2
-; SI-NEXT:    v_readlane_b32 s31, v40, 1
-; SI-NEXT:    v_readlane_b32 s30, v40, 0
+; SI-NEXT:    v_readlane_b32 s31, v40, 3
+; SI-NEXT:    v_readlane_b32 s35, v40, 1
+; SI-NEXT:    v_readlane_b32 s34, v40, 0
 ; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -14268,7 +14268,7 @@ end:
   ret <48 x i16> %phi
 }
 
-define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) {
+define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v48i16_to_v24f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15315,7 +15315,7 @@ end:
   ret <24 x float> %phi
 }
 
-define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a, i32 inreg %b) {
+define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v48i16_to_v24f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15323,29 +15323,29 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a,
 ; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v24, s30, 0
-; SI-NEXT:    v_writelane_b32 v24, s31, 1
-; SI-NEXT:    v_writelane_b32 v24, s34, 2
-; SI-NEXT:    v_writelane_b32 v24, s35, 3
-; SI-NEXT:    v_writelane_b32 v24, s36, 4
-; SI-NEXT:    v_writelane_b32 v24, s37, 5
-; SI-NEXT:    v_writelane_b32 v24, s38, 6
-; SI-NEXT:    v_writelane_b32 v24, s39, 7
-; SI-NEXT:    v_writelane_b32 v24, s48, 8
-; SI-NEXT:    v_writelane_b32 v24, s49, 9
-; SI-NEXT:    v_writelane_b32 v24, s50, 10
-; SI-NEXT:    v_writelane_b32 v24, s51, 11
-; SI-NEXT:    v_writelane_b32 v24, s52, 12
-; SI-NEXT:    v_writelane_b32 v24, s53, 13
-; SI-NEXT:    v_writelane_b32 v24, s54, 14
-; SI-NEXT:    v_writelane_b32 v24, s55, 15
-; SI-NEXT:    v_writelane_b32 v24, s64, 16
-; SI-NEXT:    v_writelane_b32 v24, s65, 17
-; SI-NEXT:    v_writelane_b32 v24, s66, 18
-; SI-NEXT:    v_writelane_b32 v24, s67, 19
-; SI-NEXT:    v_writelane_b32 v24, s68, 20
-; SI-NEXT:    v_writelane_b32 v24, s69, 21
-; SI-NEXT:    v_writelane_b32 v24, s70, 22
+; SI-NEXT:    v_writelane_b32 v24, s34, 0
+; SI-NEXT:    v_writelane_b32 v24, s35, 1
+; SI-NEXT:    v_writelane_b32 v24, s36, 2
+; SI-NEXT:    v_writelane_b32 v24, s37, 3
+; SI-NEXT:    v_writelane_b32 v24, s38, 4
+; SI-NEXT:    v_writelane_b32 v24, s39, 5
+; SI-NEXT:    v_writelane_b32 v24, s48, 6
+; SI-NEXT:    v_writelane_b32 v24, s49, 7
+; SI-NEXT:    v_writelane_b32 v24, s50, 8
+; SI-NEXT:    v_writelane_b32 v24, s51, 9
+; SI-NEXT:    v_writelane_b32 v24, s52, 10
+; SI-NEXT:    v_writelane_b32 v24, s53, 11
+; SI-NEXT:    v_writelane_b32 v24, s54, 12
+; SI-NEXT:    v_writelane_b32 v24, s55, 13
+; SI-NEXT:    v_writelane_b32 v24, s64, 14
+; SI-NEXT:    v_writelane_b32 v24, s65, 15
+; SI-NEXT:    v_writelane_b32 v24, s66, 16
+; SI-NEXT:    v_writelane_b32 v24, s67, 17
+; SI-NEXT:    v_writelane_b32 v24, s68, 18
+; SI-NEXT:    v_writelane_b32 v24, s69, 19
+; SI-NEXT:    v_writelane_b32 v24, s70, 20
+; SI-NEXT:    v_writelane_b32 v24, s71, 21
+; SI-NEXT:    v_writelane_b32 v24, s30, 22
 ; SI-NEXT:    v_readfirstlane_b32 s7, v9
 ; SI-NEXT:    v_readfirstlane_b32 s9, v8
 ; SI-NEXT:    v_readfirstlane_b32 s11, v7
@@ -15356,7 +15356,7 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a,
 ; SI-NEXT:    v_readfirstlane_b32 s88, v2
 ; SI-NEXT:    v_readfirstlane_b32 s91, v1
 ; SI-NEXT:    v_readfirstlane_b32 s94, v0
-; SI-NEXT:    v_writelane_b32 v24, s71, 23
+; SI-NEXT:    v_writelane_b32 v24, s31, 23
 ; SI-NEXT:    s_lshr_b32 s72, s29, 16
 ; SI-NEXT:    s_lshr_b32 s75, s28, 16
 ; SI-NEXT:    s_lshr_b32 s78, s27, 16
@@ -15580,6 +15580,7 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a,
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_add_i32 s59, s4, 0x30000
 ; SI-NEXT:  .LBB31_3: ; %end
+; SI-NEXT:    v_readlane_b32 s30, v24, 22
 ; SI-NEXT:    v_mov_b32_e32 v0, s36
 ; SI-NEXT:    v_mov_b32_e32 v1, s37
 ; SI-NEXT:    v_mov_b32_e32 v2, s38
@@ -15604,30 +15605,29 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a,
 ; SI-NEXT:    v_mov_b32_e32 v21, s57
 ; SI-NEXT:    v_mov_b32_e32 v22, s58
 ; SI-NEXT:    v_mov_b32_e32 v23, s59
-; SI-NEXT:    v_readlane_b32 s71, v24, 23
-; SI-NEXT:    v_readlane_b32 s70, v24, 22
-; SI-NEXT:    v_readlane_b32 s69, v24, 21
-; SI-NEXT:    v_readlane_b32 s68, v24, 20
-; SI-NEXT:    v_readlane_b32 s67, v24, 19
-; SI-NEXT:    v_readlane_b32 s66, v24, 18
-; SI-NEXT:    v_readlane_b32 s65, v24, 17
-; SI-NEXT:    v_readlane_b32 s64, v24, 16
-; SI-NEXT:    v_readlane_b32 s55, v24, 15
-; SI-NEXT:    v_readlane_b32 s54, v24, 14
-; SI-NEXT:    v_readlane_b32 s53, v24, 13
-; SI-NEXT:    v_readlane_b32 s52, v24, 12
-; SI-NEXT:    v_readlane_b32 s51, v24, 11
-; SI-NEXT:    v_readlane_b32 s50, v24, 10
-; SI-NEXT:    v_readlane_b32 s49, v24, 9
-; SI-NEXT:    v_readlane_b32 s48, v24, 8
-; SI-NEXT:    v_readlane_b32 s39, v24, 7
-; SI-NEXT:    v_readlane_b32 s38, v24, 6
-; SI-NEXT:    v_readlane_b32 s37, v24, 5
-; SI-NEXT:    v_readlane_b32 s36, v24, 4
-; SI-NEXT:    v_readlane_b32 s35, v24, 3
-; SI-NEXT:    v_readlane_b32 s34, v24, 2
-; SI-NEXT:    v_readlane_b32 s31, v24, 1
-; SI-NEXT:    v_readlane_b32 s30, v24, 0
+; SI-NEXT:    v_readlane_b32 s31, v24, 23
+; SI-NEXT:    v_readlane_b32 s71, v24, 21
+; SI-NEXT:    v_readlane_b32 s70, v24, 20
+; SI-NEXT:    v_readlane_b32 s69, v24, 19
+; SI-NEXT:    v_readlane_b32 s68, v24, 18
+; SI-NEXT:    v_readlane_b32 s67, v24, 17
+; SI-NEXT:    v_readlane_b32 s66, v24, 16
+; SI-NEXT:    v_readlane_b32 s65, v24, 15
+; SI-NEXT:    v_readlane_b32 s64, v24, 14
+; SI-NEXT:    v_readlane_b32 s55, v24, 13
+; SI-NEXT:    v_readlane_b32 s54, v24, 12
+; SI-NEXT:    v_readlane_b32 s53, v24, 11
+; SI-NEXT:    v_readlane_b32 s52, v24, 10
+; SI-NEXT:    v_readlane_b32 s51, v24, 9
+; SI-NEXT:    v_readlane_b32 s50, v24, 8
+; SI-NEXT:    v_readlane_b32 s49, v24, 7
+; SI-NEXT:    v_readlane_b32 s48, v24, 6
+; SI-NEXT:    v_readlane_b32 s39, v24, 5
+; SI-NEXT:    v_readlane_b32 s38, v24, 4
+; SI-NEXT:    v_readlane_b32 s37, v24, 3
+; SI-NEXT:    v_readlane_b32 s36, v24, 2
+; SI-NEXT:    v_readlane_b32 s35, v24, 1
+; SI-NEXT:    v_readlane_b32 s34, v24, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -15643,33 +15643,33 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a,
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v24, s30, 0
-; VI-NEXT:    v_writelane_b32 v24, s31, 1
-; VI-NEXT:    v_writelane_b32 v24, s34, 2
-; VI-NEXT:    v_writelane_b32 v24, s35, 3
-; VI-NEXT:    v_writelane_b32 v24, s36, 4
-; VI-NEXT:    v_writelane_b32 v24, s37, 5
-; VI-NEXT:    v_writelane_b32 v24, s38, 6
-; VI-NEXT:    v_writelane_b32 v24, s39, 7
-; VI-NEXT:    v_writelane_b32 v24, s48, 8
-; VI-NEXT:    v_writelane_b32 v24, s49, 9
-; VI-NEXT:    v_writelane_b32 v24, s50, 10
-; VI-NEXT:    v_writelane_b32 v24, s51, 11
-; VI-NEXT:    v_writelane_b32 v24, s52, 12
-; VI-NEXT:    v_writelane_b32 v24, s53, 13
-; VI-NEXT:    v_writelane_b32 v24, s54, 14
-; VI-NEXT:    v_writelane_b32 v24, s55, 15
-; VI-NEXT:    v_writelane_b32 v24, s64, 16
-; VI-NEXT:    v_writelane_b32 v24, s65, 17
-; VI-NEXT:    v_writelane_b32 v24, s66, 18
-; VI-NEXT:    v_writelane_b32 v24, s67, 19
-; VI-NEXT:    v_writelane_b32 v24, s68, 20
-; VI-NEXT:    v_writelane_b32 v24, s69, 21
-; VI-NEXT:    v_writelane_b32 v24, s70, 22
-; VI-NEXT:    v_writelane_b32 v24, s71, 23
-; VI-NEXT:    v_writelane_b32 v24, s80, 24
-; VI-NEXT:    v_writelane_b32 v24, s81, 25
-; VI-NEXT:    v_writelane_b32 v24, s82, 26
+; VI-NEXT:    v_writelane_b32 v24, s34, 0
+; VI-NEXT:    v_writelane_b32 v24, s35, 1
+; VI-NEXT:    v_writelane_b32 v24, s36, 2
+; VI-NEXT:    v_writelane_b32 v24, s37, 3
+; VI-NEXT:    v_writelane_b32 v24, s38, 4
+; VI-NEXT:    v_writelane_b32 v24, s39, 5
+; VI-NEXT:    v_writelane_b32 v24, s48, 6
+; VI-NEXT:    v_writelane_b32 v24, s49, 7
+; VI-NEXT:    v_writelane_b32 v24, s50, 8
+; VI-NEXT:    v_writelane_b32 v24, s51, 9
+; VI-NEXT:    v_writelane_b32 v24, s52, 10
+; VI-NEXT:    v_writelane_b32 v24, s53, 11
+; VI-NEXT:    v_writelane_b32 v24, s54, 12
+; VI-NEXT:    v_writelane_b32 v24, s55, 13
+; VI-NEXT:    v_writelane_b32 v24, s64, 14
+; VI-NEXT:    v_writelane_b32 v24, s65, 15
+; VI-NEXT:    v_writelane_b32 v24, s66, 16
+; VI-NEXT:    v_writelane_b32 v24, s67, 17
+; VI-NEXT:    v_writelane_b32 v24, s68, 18
+; VI-NEXT:    v_writelane_b32 v24, s69, 19
+; VI-NEXT:    v_writelane_b32 v24, s70, 20
+; VI-NEXT:    v_writelane_b32 v24, s71, 21
+; VI-NEXT:    v_writelane_b32 v24, s80, 22
+; VI-NEXT:    v_writelane_b32 v24, s81, 23
+; VI-NEXT:    v_writelane_b32 v24, s82, 24
+; VI-NEXT:    v_writelane_b32 v24, s83, 25
+; VI-NEXT:    v_writelane_b32 v24, s30, 26
 ; VI-NEXT:    v_readfirstlane_b32 s7, v9
 ; VI-NEXT:    v_readfirstlane_b32 s9, v8
 ; VI-NEXT:    v_readfirstlane_b32 s11, v7
@@ -15680,7 +15680,7 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a,
 ; VI-NEXT:    v_readfirstlane_b32 s88, v2
 ; VI-NEXT:    v_readfirstlane_b32 s91, v1
 ; VI-NEXT:    v_readfirstlane_b32 s34, v0
-; VI-NEXT:    v_writelane_b32 v24, s83, 27
+; VI-NEXT:    v_writelane_b32 v24, s31, 27
 ; VI-NEXT:    s_lshr_b32 s72, s29, 16
 ; VI-NEXT:    s_lshr_b32 s75, s28, 16
 ; VI-NEXT:    s_lshr_b32 s78, s27, 16
@@ -15904,6 +15904,7 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a,
 ; VI-NEXT:    s_or_b32 s4, s5, s4
 ; VI-NEXT:    s_add_i32 s59, s4, 0x30000
 ; VI-NEXT:  .LBB31_3: ; %end
+; VI-NEXT:    v_readlane_b32 s30, v24, 26
 ; VI-NEXT:    v_mov_b32_e32 v0, s36
 ; VI-NEXT:    v_mov_b32_e32 v1, s37
 ; VI-NEXT:    v_mov_b32_e32 v2, s38
@@ -15928,34 +15929,33 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a,
 ; VI-NEXT:    v_mov_b32_e32 v21, s57
 ; VI-NEXT:    v_mov_b32_e32 v22, s58
 ; VI-NEXT:    v_mov_b32_e32 v23, s59
-; VI-NEXT:    v_readlane_b32 s83, v24, 27
-; VI-NEXT:    v_readlane_b32 s82, v24, 26
-; VI-NEXT:    v_readlane_b32 s81, v24, 25
-; VI-NEXT:    v_readlane_b32 s80, v24, 24
-; VI-NEXT:    v_readlane_b32 s71, v24, 23
-; VI-NEXT:    v_readlane_b32 s70, v24, 22
-; VI-NEXT:    v_readlane_b32 s69, v24, 21
-; VI-NEXT:    v_readlane_b32 s68, v24, 20
-; VI-NEXT:    v_readlane_b32 s67, v24, 19
-; VI-NEXT:    v_readlane_b32 s66, v24, 18
-; VI-NEXT:    v_readlane_b32 s65, v24, 17
-; VI-NEXT:    v_readlane_b32 s64, v24, 16
-; VI-NEXT:    v_readlane_b32 s55, v24, 15
-; VI-NEXT:    v_readlane_b32 s54, v24, 14
-; VI-NEXT:    v_readlane_b32 s53, v24, 13
-; VI-NEXT:    v_readlane_b32 s52, v24, 12
-; VI-NEXT:    v_readlane_b32 s51, v24, 11
-; VI-NEXT:    v_readlane_b32 s50, v24, 10
-; VI-NEXT:    v_readlane_b32 s49, v24, 9
-; VI-NEXT:    v_readlane_b32 s48, v24, 8
-; VI-NEXT:    v_readlane_b32 s39, v24, 7
-; VI-NEXT:    v_readlane_b32 s38, v24, 6
-; VI-NEXT:    v_readlane_b32 s37, v24, 5
-; VI-NEXT:    v_readlane_b32 s36, v24, 4
-; VI-NEXT:    v_readlane_b32 s35, v24, 3
-; VI-NEXT:    v_readlane_b32 s34, v24, 2
-; VI-NEXT:    v_readlane_b32 s31, v24, 1
-; VI-NEXT:    v_readlane_b32 s30, v24, 0
+; VI-NEXT:    v_readlane_b32 s31, v24, 27
+; VI-NEXT:    v_readlane_b32 s83, v24, 25
+; VI-NEXT:    v_readlane_b32 s82, v24, 24
+; VI-NEXT:    v_readlane_b32 s81, v24, 23
+; VI-NEXT:    v_readlane_b32 s80, v24, 22
+; VI-NEXT:    v_readlane_b32 s71, v24, 21
+; VI-NEXT:    v_readlane_b32 s70, v24, 20
+; VI-NEXT:    v_readlane_b32 s69, v24, 19
+; VI-NEXT:    v_readlane_b32 s68, v24, 18
+; VI-NEXT:    v_readlane_b32 s67, v24, 17
+; VI-NEXT:    v_readlane_b32 s66, v24, 16
+; VI-NEXT:    v_readlane_b32 s65, v24, 15
+; VI-NEXT:    v_readlane_b32 s64, v24, 14
+; VI-NEXT:    v_readlane_b32 s55, v24, 13
+; VI-NEXT:    v_readlane_b32 s54, v24, 12
+; VI-NEXT:    v_readlane_b32 s53, v24, 11
+; VI-NEXT:    v_readlane_b32 s52, v24, 10
+; VI-NEXT:    v_readlane_b32 s51, v24, 9
+; VI-NEXT:    v_readlane_b32 s50, v24, 8
+; VI-NEXT:    v_readlane_b32 s49, v24, 7
+; VI-NEXT:    v_readlane_b32 s48, v24, 6
+; VI-NEXT:    v_readlane_b32 s39, v24, 5
+; VI-NEXT:    v_readlane_b32 s38, v24, 4
+; VI-NEXT:    v_readlane_b32 s37, v24, 3
+; VI-NEXT:    v_readlane_b32 s36, v24, 2
+; VI-NEXT:    v_readlane_b32 s35, v24, 1
+; VI-NEXT:    v_readlane_b32 s34, v24, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -16253,7 +16253,7 @@ end:
   ret <24 x float> %phi
 }
 
-define <48 x half> @bitcast_v24f32_to_v48f16(<24 x float> %a, i32 %b) {
+define <48 x half> @bitcast_v24f32_to_v48f16(<24 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v24f32_to_v48f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16903,7 +16903,7 @@ end:
   ret <48 x half> %phi
 }
 
-define inreg <48 x half> @bitcast_v24f32_to_v48f16_scalar(<24 x float> inreg %a, i32 inreg %b) {
+define inreg <48 x half> @bitcast_v24f32_to_v48f16_scalar(<24 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v24f32_to_v48f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16911,10 +16911,10 @@ define inreg <48 x half> @bitcast_v24f32_to_v48f16_scalar(<24 x float> inreg %a,
 ; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v40, s30, 0
-; SI-NEXT:    v_writelane_b32 v40, s31, 1
+; SI-NEXT:    v_writelane_b32 v40, s34, 0
+; SI-NEXT:    v_writelane_b32 v40, s35, 1
 ; SI-NEXT:    v_readfirstlane_b32 s12, v10
-; SI-NEXT:    v_writelane_b32 v40, s34, 2
+; SI-NEXT:    v_writelane_b32 v40, s30, 2
 ; SI-NEXT:    v_readfirstlane_b32 s5, v9
 ; SI-NEXT:    v_readfirstlane_b32 s4, v8
 ; SI-NEXT:    v_readfirstlane_b32 s7, v7
@@ -16926,7 +16926,7 @@ define inreg <48 x half> @bitcast_v24f32_to_v48f16_scalar(<24 x float> inreg %a,
 ; SI-NEXT:    v_readfirstlane_b32 s13, v1
 ; SI-NEXT:    s_cmp_lg_u32 s12, 0
 ; SI-NEXT:    v_readfirstlane_b32 s12, v0
-; SI-NEXT:    v_writelane_b32 v40, s35, 3
+; SI-NEXT:    v_writelane_b32 v40, s31, 3
 ; SI-NEXT:    s_cbranch_scc0 .LBB33_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s35, s5, 16
@@ -17140,6 +17140,7 @@ define inreg <48 x half> @bitcast_v24f32_to_v48f16_scalar(<24 x float> inreg %a,
 ; SI-NEXT:    v_or_b32_e32 v22, v22, v24
 ; SI-NEXT:    v_and_b32_e32 v23, 0xffff, v23
 ; SI-NEXT:    v_lshlrev_b32_e32 v24, 16, v36
+; SI-NEXT:    v_readlane_b32 s30, v40, 2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v35
 ; SI-NEXT:    v_or_b32_e32 v3, v3, v34
 ; SI-NEXT:    v_or_b32_e32 v5, v5, v33
@@ -17152,10 +17153,9 @@ define inreg <48 x half> @bitcast_v24f32_to_v48f16_scalar(<24 x float> inreg %a,
 ; SI-NEXT:    v_or_b32_e32 v19, v19, v26
 ; SI-NEXT:    v_or_b32_e32 v21, v21, v25
 ; SI-NEXT:    v_or_b32_e32 v23, v23, v24
-; SI-NEXT:    v_readlane_b32 s35, v40, 3
-; SI-NEXT:    v_readlane_b32 s34, v40, 2
-; SI-NEXT:    v_readlane_b32 s31, v40, 1
-; SI-NEXT:    v_readlane_b32 s30, v40, 0
+; SI-NEXT:    v_readlane_b32 s31, v40, 3
+; SI-NEXT:    v_readlane_b32 s35, v40, 1
+; SI-NEXT:    v_readlane_b32 s34, v40, 0
 ; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -17975,7 +17975,7 @@ end:
   ret <48 x half> %phi
 }
 
-define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) {
+define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v48f16_to_v24f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19134,7 +19134,7 @@ end:
   ret <24 x float> %phi
 }
 
-define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a, i32 inreg %b) {
+define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v48f16_to_v24f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19142,29 +19142,29 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a,
 ; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v32, s30, 0
-; SI-NEXT:    v_writelane_b32 v32, s31, 1
-; SI-NEXT:    v_writelane_b32 v32, s34, 2
-; SI-NEXT:    v_writelane_b32 v32, s35, 3
-; SI-NEXT:    v_writelane_b32 v32, s36, 4
-; SI-NEXT:    v_writelane_b32 v32, s37, 5
-; SI-NEXT:    v_writelane_b32 v32, s38, 6
-; SI-NEXT:    v_writelane_b32 v32, s39, 7
-; SI-NEXT:    v_writelane_b32 v32, s48, 8
-; SI-NEXT:    v_writelane_b32 v32, s49, 9
-; SI-NEXT:    v_writelane_b32 v32, s50, 10
-; SI-NEXT:    v_writelane_b32 v32, s51, 11
-; SI-NEXT:    v_writelane_b32 v32, s52, 12
-; SI-NEXT:    v_writelane_b32 v32, s53, 13
-; SI-NEXT:    v_writelane_b32 v32, s54, 14
-; SI-NEXT:    v_writelane_b32 v32, s55, 15
-; SI-NEXT:    v_writelane_b32 v32, s64, 16
-; SI-NEXT:    v_writelane_b32 v32, s65, 17
-; SI-NEXT:    v_writelane_b32 v32, s66, 18
-; SI-NEXT:    v_writelane_b32 v32, s67, 19
-; SI-NEXT:    v_writelane_b32 v32, s68, 20
-; SI-NEXT:    v_writelane_b32 v32, s69, 21
-; SI-NEXT:    v_writelane_b32 v32, s70, 22
+; SI-NEXT:    v_writelane_b32 v32, s34, 0
+; SI-NEXT:    v_writelane_b32 v32, s35, 1
+; SI-NEXT:    v_writelane_b32 v32, s36, 2
+; SI-NEXT:    v_writelane_b32 v32, s37, 3
+; SI-NEXT:    v_writelane_b32 v32, s38, 4
+; SI-NEXT:    v_writelane_b32 v32, s39, 5
+; SI-NEXT:    v_writelane_b32 v32, s48, 6
+; SI-NEXT:    v_writelane_b32 v32, s49, 7
+; SI-NEXT:    v_writelane_b32 v32, s50, 8
+; SI-NEXT:    v_writelane_b32 v32, s51, 9
+; SI-NEXT:    v_writelane_b32 v32, s52, 10
+; SI-NEXT:    v_writelane_b32 v32, s53, 11
+; SI-NEXT:    v_writelane_b32 v32, s54, 12
+; SI-NEXT:    v_writelane_b32 v32, s55, 13
+; SI-NEXT:    v_writelane_b32 v32, s64, 14
+; SI-NEXT:    v_writelane_b32 v32, s65, 15
+; SI-NEXT:    v_writelane_b32 v32, s66, 16
+; SI-NEXT:    v_writelane_b32 v32, s67, 17
+; SI-NEXT:    v_writelane_b32 v32, s68, 18
+; SI-NEXT:    v_writelane_b32 v32, s69, 19
+; SI-NEXT:    v_writelane_b32 v32, s70, 20
+; SI-NEXT:    v_writelane_b32 v32, s71, 21
+; SI-NEXT:    v_writelane_b32 v32, s30, 22
 ; SI-NEXT:    v_readfirstlane_b32 s6, v9
 ; SI-NEXT:    v_readfirstlane_b32 s8, v8
 ; SI-NEXT:    v_readfirstlane_b32 s10, v7
@@ -19175,7 +19175,7 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a,
 ; SI-NEXT:    v_readfirstlane_b32 s76, v2
 ; SI-NEXT:    v_readfirstlane_b32 s79, v1
 ; SI-NEXT:    v_readfirstlane_b32 s89, v0
-; SI-NEXT:    v_writelane_b32 v32, s71, 23
+; SI-NEXT:    v_writelane_b32 v32, s31, 23
 ; SI-NEXT:    s_lshr_b32 s78, s29, 16
 ; SI-NEXT:    s_lshr_b32 s90, s28, 16
 ; SI-NEXT:    s_lshr_b32 s92, s27, 16
@@ -19508,30 +19508,30 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a,
 ; SI-NEXT:    v_mov_b32_e32 v30, s66
 ; SI-NEXT:    v_mov_b32_e32 v31, s67
 ; SI-NEXT:  .LBB35_5: ; %end
-; SI-NEXT:    v_readlane_b32 s71, v32, 23
-; SI-NEXT:    v_readlane_b32 s70, v32, 22
-; SI-NEXT:    v_readlane_b32 s69, v32, 21
-; SI-NEXT:    v_readlane_b32 s68, v32, 20
-; SI-NEXT:    v_readlane_b32 s67, v32, 19
-; SI-NEXT:    v_readlane_b32 s66, v32, 18
-; SI-NEXT:    v_readlane_b32 s65, v32, 17
-; SI-NEXT:    v_readlane_b32 s64, v32, 16
-; SI-NEXT:    v_readlane_b32 s55, v32, 15
-; SI-NEXT:    v_readlane_b32 s54, v32, 14
-; SI-NEXT:    v_readlane_b32 s53, v32, 13
-; SI-NEXT:    v_readlane_b32 s52, v32, 12
-; SI-NEXT:    v_readlane_b32 s51, v32, 11
-; SI-NEXT:    v_readlane_b32 s50, v32, 10
-; SI-NEXT:    v_readlane_b32 s49, v32, 9
-; SI-NEXT:    v_readlane_b32 s48, v32, 8
-; SI-NEXT:    v_readlane_b32 s39, v32, 7
-; SI-NEXT:    v_readlane_b32 s38, v32, 6
-; SI-NEXT:    v_readlane_b32 s37, v32, 5
-; SI-NEXT:    v_readlane_b32 s36, v32, 4
-; SI-NEXT:    v_readlane_b32 s35, v32, 3
-; SI-NEXT:    v_readlane_b32 s34, v32, 2
-; SI-NEXT:    v_readlane_b32 s31, v32, 1
-; SI-NEXT:    v_readlane_b32 s30, v32, 0
+; SI-NEXT:    v_readlane_b32 s30, v32, 22
+; SI-NEXT:    v_readlane_b32 s31, v32, 23
+; SI-NEXT:    v_readlane_b32 s71, v32, 21
+; SI-NEXT:    v_readlane_b32 s70, v32, 20
+; SI-NEXT:    v_readlane_b32 s69, v32, 19
+; SI-NEXT:    v_readlane_b32 s68, v32, 18
+; SI-NEXT:    v_readlane_b32 s67, v32, 17
+; SI-NEXT:    v_readlane_b32 s66, v32, 16
+; SI-NEXT:    v_readlane_b32 s65, v32, 15
+; SI-NEXT:    v_readlane_b32 s64, v32, 14
+; SI-NEXT:    v_readlane_b32 s55, v32, 13
+; SI-NEXT:    v_readlane_b32 s54, v32, 12
+; SI-NEXT:    v_readlane_b32 s53, v32, 11
+; SI-NEXT:    v_readlane_b32 s52, v32, 10
+; SI-NEXT:    v_readlane_b32 s51, v32, 9
+; SI-NEXT:    v_readlane_b32 s50, v32, 8
+; SI-NEXT:    v_readlane_b32 s49, v32, 7
+; SI-NEXT:    v_readlane_b32 s48, v32, 6
+; SI-NEXT:    v_readlane_b32 s39, v32, 5
+; SI-NEXT:    v_readlane_b32 s38, v32, 4
+; SI-NEXT:    v_readlane_b32 s37, v32, 3
+; SI-NEXT:    v_readlane_b32 s36, v32, 2
+; SI-NEXT:    v_readlane_b32 s35, v32, 1
+; SI-NEXT:    v_readlane_b32 s34, v32, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -19544,33 +19544,33 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a,
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v32, s30, 0
-; VI-NEXT:    v_writelane_b32 v32, s31, 1
-; VI-NEXT:    v_writelane_b32 v32, s34, 2
-; VI-NEXT:    v_writelane_b32 v32, s35, 3
-; VI-NEXT:    v_writelane_b32 v32, s36, 4
-; VI-NEXT:    v_writelane_b32 v32, s37, 5
-; VI-NEXT:    v_writelane_b32 v32, s38, 6
-; VI-NEXT:    v_writelane_b32 v32, s39, 7
-; VI-NEXT:    v_writelane_b32 v32, s48, 8
-; VI-NEXT:    v_writelane_b32 v32, s49, 9
-; VI-NEXT:    v_writelane_b32 v32, s50, 10
-; VI-NEXT:    v_writelane_b32 v32, s51, 11
-; VI-NEXT:    v_writelane_b32 v32, s52, 12
-; VI-NEXT:    v_writelane_b32 v32, s53, 13
-; VI-NEXT:    v_writelane_b32 v32, s54, 14
-; VI-NEXT:    v_writelane_b32 v32, s55, 15
-; VI-NEXT:    v_writelane_b32 v32, s64, 16
-; VI-NEXT:    v_writelane_b32 v32, s65, 17
-; VI-NEXT:    v_writelane_b32 v32, s66, 18
-; VI-NEXT:    v_writelane_b32 v32, s67, 19
-; VI-NEXT:    v_writelane_b32 v32, s68, 20
-; VI-NEXT:    v_writelane_b32 v32, s69, 21
-; VI-NEXT:    v_writelane_b32 v32, s70, 22
-; VI-NEXT:    v_writelane_b32 v32, s71, 23
-; VI-NEXT:    v_writelane_b32 v32, s80, 24
-; VI-NEXT:    v_writelane_b32 v32, s81, 25
-; VI-NEXT:    v_writelane_b32 v32, s82, 26
+; VI-NEXT:    v_writelane_b32 v32, s34, 0
+; VI-NEXT:    v_writelane_b32 v32, s35, 1
+; VI-NEXT:    v_writelane_b32 v32, s36, 2
+; VI-NEXT:    v_writelane_b32 v32, s37, 3
+; VI-NEXT:    v_writelane_b32 v32, s38, 4
+; VI-NEXT:    v_writelane_b32 v32, s39, 5
+; VI-NEXT:    v_writelane_b32 v32, s48, 6
+; VI-NEXT:    v_writelane_b32 v32, s49, 7
+; VI-NEXT:    v_writelane_b32 v32, s50, 8
+; VI-NEXT:    v_writelane_b32 v32, s51, 9
+; VI-NEXT:    v_writelane_b32 v32, s52, 10
+; VI-NEXT:    v_writelane_b32 v32, s53, 11
+; VI-NEXT:    v_writelane_b32 v32, s54, 12
+; VI-NEXT:    v_writelane_b32 v32, s55, 13
+; VI-NEXT:    v_writelane_b32 v32, s64, 14
+; VI-NEXT:    v_writelane_b32 v32, s65, 15
+; VI-NEXT:    v_writelane_b32 v32, s66, 16
+; VI-NEXT:    v_writelane_b32 v32, s67, 17
+; VI-NEXT:    v_writelane_b32 v32, s68, 18
+; VI-NEXT:    v_writelane_b32 v32, s69, 19
+; VI-NEXT:    v_writelane_b32 v32, s70, 20
+; VI-NEXT:    v_writelane_b32 v32, s71, 21
+; VI-NEXT:    v_writelane_b32 v32, s80, 22
+; VI-NEXT:    v_writelane_b32 v32, s81, 23
+; VI-NEXT:    v_writelane_b32 v32, s82, 24
+; VI-NEXT:    v_writelane_b32 v32, s83, 25
+; VI-NEXT:    v_writelane_b32 v32, s30, 26
 ; VI-NEXT:    v_readfirstlane_b32 s6, v9
 ; VI-NEXT:    v_readfirstlane_b32 s8, v8
 ; VI-NEXT:    v_readfirstlane_b32 s10, v7
@@ -19581,7 +19581,7 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a,
 ; VI-NEXT:    v_readfirstlane_b32 s79, v2
 ; VI-NEXT:    v_readfirstlane_b32 s91, v1
 ; VI-NEXT:    v_readfirstlane_b32 s34, v0
-; VI-NEXT:    v_writelane_b32 v32, s83, 27
+; VI-NEXT:    v_writelane_b32 v32, s31, 27
 ; VI-NEXT:    s_lshr_b32 s72, s29, 16
 ; VI-NEXT:    s_lshr_b32 s74, s28, 16
 ; VI-NEXT:    s_lshr_b32 s77, s27, 16
@@ -19819,34 +19819,34 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a,
 ; VI-NEXT:    v_mov_b32_e32 v30, s66
 ; VI-NEXT:    v_mov_b32_e32 v31, s67
 ; VI-NEXT:  .LBB35_5: ; %end
-; VI-NEXT:    v_readlane_b32 s83, v32, 27
-; VI-NEXT:    v_readlane_b32 s82, v32, 26
-; VI-NEXT:    v_readlane_b32 s81, v32, 25
-; VI-NEXT:    v_readlane_b32 s80, v32, 24
-; VI-NEXT:    v_readlane_b32 s71, v32, 23
-; VI-NEXT:    v_readlane_b32 s70, v32, 22
-; VI-NEXT:    v_readlane_b32 s69, v32, 21
-; VI-NEXT:    v_readlane_b32 s68, v32, 20
-; VI-NEXT:    v_readlane_b32 s67, v32, 19
-; VI-NEXT:    v_readlane_b32 s66, v32, 18
-; VI-NEXT:    v_readlane_b32 s65, v32, 17
-; VI-NEXT:    v_readlane_b32 s64, v32, 16
-; VI-NEXT:    v_readlane_b32 s55, v32, 15
-; VI-NEXT:    v_readlane_b32 s54, v32, 14
-; VI-NEXT:    v_readlane_b32 s53, v32, 13
-; VI-NEXT:    v_readlane_b32 s52, v32, 12
-; VI-NEXT:    v_readlane_b32 s51, v32, 11
-; VI-NEXT:    v_readlane_b32 s50, v32, 10
-; VI-NEXT:    v_readlane_b32 s49, v32, 9
-; VI-NEXT:    v_readlane_b32 s48, v32, 8
-; VI-NEXT:    v_readlane_b32 s39, v32, 7
-; VI-NEXT:    v_readlane_b32 s38, v32, 6
-; VI-NEXT:    v_readlane_b32 s37, v32, 5
-; VI-NEXT:    v_readlane_b32 s36, v32, 4
-; VI-NEXT:    v_readlane_b32 s35, v32, 3
-; VI-NEXT:    v_readlane_b32 s34, v32, 2
-; VI-NEXT:    v_readlane_b32 s31, v32, 1
-; VI-NEXT:    v_readlane_b32 s30, v32, 0
+; VI-NEXT:    v_readlane_b32 s30, v32, 26
+; VI-NEXT:    v_readlane_b32 s31, v32, 27
+; VI-NEXT:    v_readlane_b32 s83, v32, 25
+; VI-NEXT:    v_readlane_b32 s82, v32, 24
+; VI-NEXT:    v_readlane_b32 s81, v32, 23
+; VI-NEXT:    v_readlane_b32 s80, v32, 22
+; VI-NEXT:    v_readlane_b32 s71, v32, 21
+; VI-NEXT:    v_readlane_b32 s70, v32, 20
+; VI-NEXT:    v_readlane_b32 s69, v32, 19
+; VI-NEXT:    v_readlane_b32 s68, v32, 18
+; VI-NEXT:    v_readlane_b32 s67, v32, 17
+; VI-NEXT:    v_readlane_b32 s66, v32, 16
+; VI-NEXT:    v_readlane_b32 s65, v32, 15
+; VI-NEXT:    v_readlane_b32 s64, v32, 14
+; VI-NEXT:    v_readlane_b32 s55, v32, 13
+; VI-NEXT:    v_readlane_b32 s54, v32, 12
+; VI-NEXT:    v_readlane_b32 s53, v32, 11
+; VI-NEXT:    v_readlane_b32 s52, v32, 10
+; VI-NEXT:    v_readlane_b32 s51, v32, 9
+; VI-NEXT:    v_readlane_b32 s50, v32, 8
+; VI-NEXT:    v_readlane_b32 s49, v32, 7
+; VI-NEXT:    v_readlane_b32 s48, v32, 6
+; VI-NEXT:    v_readlane_b32 s39, v32, 5
+; VI-NEXT:    v_readlane_b32 s38, v32, 4
+; VI-NEXT:    v_readlane_b32 s37, v32, 3
+; VI-NEXT:    v_readlane_b32 s36, v32, 2
+; VI-NEXT:    v_readlane_b32 s35, v32, 1
+; VI-NEXT:    v_readlane_b32 s34, v32, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -20142,7 +20142,7 @@ end:
   ret <24 x float> %phi
 }
 
-define <12 x double> @bitcast_v12i64_to_v12f64(<12 x i64> %a, i32 %b) {
+define <12 x double> @bitcast_v12i64_to_v12f64(<12 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v12i64_to_v12f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20314,7 +20314,7 @@ end:
   ret <12 x double> %phi
 }
 
-define inreg <12 x double> @bitcast_v12i64_to_v12f64_scalar(<12 x i64> inreg %a, i32 inreg %b) {
+define inreg <12 x double> @bitcast_v12i64_to_v12f64_scalar(<12 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v12i64_to_v12f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20605,7 +20605,7 @@ end:
   ret <12 x double> %phi
 }
 
-define <12 x i64> @bitcast_v12f64_to_v12i64(<12 x double> %a, i32 %b) {
+define <12 x i64> @bitcast_v12f64_to_v12i64(<12 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v12f64_to_v12i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20723,7 +20723,7 @@ end:
   ret <12 x i64> %phi
 }
 
-define inreg <12 x i64> @bitcast_v12f64_to_v12i64_scalar(<12 x double> inreg %a, i32 inreg %b) {
+define inreg <12 x i64> @bitcast_v12f64_to_v12i64_scalar(<12 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v12f64_to_v12i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -21187,7 +21187,7 @@ end:
   ret <12 x i64> %phi
 }
 
-define <48 x i16> @bitcast_v12i64_to_v48i16(<12 x i64> %a, i32 %b) {
+define <48 x i16> @bitcast_v12i64_to_v48i16(<12 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v12i64_to_v48i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -21873,7 +21873,7 @@ end:
   ret <48 x i16> %phi
 }
 
-define inreg <48 x i16> @bitcast_v12i64_to_v48i16_scalar(<12 x i64> inreg %a, i32 inreg %b) {
+define inreg <48 x i16> @bitcast_v12i64_to_v48i16_scalar(<12 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v12i64_to_v48i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -21881,10 +21881,10 @@ define inreg <48 x i16> @bitcast_v12i64_to_v48i16_scalar(<12 x i64> inreg %a, i3
 ; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v24, s30, 0
-; SI-NEXT:    v_writelane_b32 v24, s31, 1
+; SI-NEXT:    v_writelane_b32 v24, s34, 0
+; SI-NEXT:    v_writelane_b32 v24, s35, 1
 ; SI-NEXT:    v_readfirstlane_b32 s12, v10
-; SI-NEXT:    v_writelane_b32 v24, s34, 2
+; SI-NEXT:    v_writelane_b32 v24, s30, 2
 ; SI-NEXT:    v_readfirstlane_b32 s5, v9
 ; SI-NEXT:    v_readfirstlane_b32 s4, v8
 ; SI-NEXT:    v_readfirstlane_b32 s7, v7
@@ -21896,7 +21896,7 @@ define inreg <48 x i16> @bitcast_v12i64_to_v48i16_scalar(<12 x i64> inreg %a, i3
 ; SI-NEXT:    v_readfirstlane_b32 s13, v1
 ; SI-NEXT:    s_cmp_lg_u32 s12, 0
 ; SI-NEXT:    v_readfirstlane_b32 s12, v0
-; SI-NEXT:    v_writelane_b32 v24, s35, 3
+; SI-NEXT:    v_writelane_b32 v24, s31, 3
 ; SI-NEXT:    s_cbranch_scc0 .LBB41_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s88, s5, 16
@@ -22046,6 +22046,7 @@ define inreg <48 x i16> @bitcast_v12i64_to_v48i16_scalar(<12 x i64> inreg %a, i3
 ; SI-NEXT:    s_lshl_b32 s14, s88, 16
 ; SI-NEXT:    s_or_b32 s7, s7, s29
 ; SI-NEXT:    s_or_b32 s5, s5, s14
+; SI-NEXT:    v_readlane_b32 s30, v24, 2
 ; SI-NEXT:    v_mov_b32_e32 v0, s15
 ; SI-NEXT:    v_mov_b32_e32 v1, s16
 ; SI-NEXT:    v_mov_b32_e32 v2, s17
@@ -22070,10 +22071,9 @@ define inreg <48 x i16> @bitcast_v12i64_to_v48i16_scalar(<12 x i64> inreg %a, i3
 ; SI-NEXT:    v_mov_b32_e32 v21, s7
 ; SI-NEXT:    v_mov_b32_e32 v22, s4
 ; SI-NEXT:    v_mov_b32_e32 v23, s5
-; SI-NEXT:    v_readlane_b32 s35, v24, 3
-; SI-NEXT:    v_readlane_b32 s34, v24, 2
-; SI-NEXT:    v_readlane_b32 s31, v24, 1
-; SI-NEXT:    v_readlane_b32 s30, v24, 0
+; SI-NEXT:    v_readlane_b32 s31, v24, 3
+; SI-NEXT:    v_readlane_b32 s35, v24, 1
+; SI-NEXT:    v_readlane_b32 s34, v24, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -22661,7 +22661,7 @@ end:
   ret <48 x i16> %phi
 }
 
-define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) {
+define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v48i16_to_v12i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -23708,7 +23708,7 @@ end:
   ret <12 x i64> %phi
 }
 
-define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i32 inreg %b) {
+define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v48i16_to_v12i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -23716,29 +23716,29 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3
 ; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v24, s30, 0
-; SI-NEXT:    v_writelane_b32 v24, s31, 1
-; SI-NEXT:    v_writelane_b32 v24, s34, 2
-; SI-NEXT:    v_writelane_b32 v24, s35, 3
-; SI-NEXT:    v_writelane_b32 v24, s36, 4
-; SI-NEXT:    v_writelane_b32 v24, s37, 5
-; SI-NEXT:    v_writelane_b32 v24, s38, 6
-; SI-NEXT:    v_writelane_b32 v24, s39, 7
-; SI-NEXT:    v_writelane_b32 v24, s48, 8
-; SI-NEXT:    v_writelane_b32 v24, s49, 9
-; SI-NEXT:    v_writelane_b32 v24, s50, 10
-; SI-NEXT:    v_writelane_b32 v24, s51, 11
-; SI-NEXT:    v_writelane_b32 v24, s52, 12
-; SI-NEXT:    v_writelane_b32 v24, s53, 13
-; SI-NEXT:    v_writelane_b32 v24, s54, 14
-; SI-NEXT:    v_writelane_b32 v24, s55, 15
-; SI-NEXT:    v_writelane_b32 v24, s64, 16
-; SI-NEXT:    v_writelane_b32 v24, s65, 17
-; SI-NEXT:    v_writelane_b32 v24, s66, 18
-; SI-NEXT:    v_writelane_b32 v24, s67, 19
-; SI-NEXT:    v_writelane_b32 v24, s68, 20
-; SI-NEXT:    v_writelane_b32 v24, s69, 21
-; SI-NEXT:    v_writelane_b32 v24, s70, 22
+; SI-NEXT:    v_writelane_b32 v24, s34, 0
+; SI-NEXT:    v_writelane_b32 v24, s35, 1
+; SI-NEXT:    v_writelane_b32 v24, s36, 2
+; SI-NEXT:    v_writelane_b32 v24, s37, 3
+; SI-NEXT:    v_writelane_b32 v24, s38, 4
+; SI-NEXT:    v_writelane_b32 v24, s39, 5
+; SI-NEXT:    v_writelane_b32 v24, s48, 6
+; SI-NEXT:    v_writelane_b32 v24, s49, 7
+; SI-NEXT:    v_writelane_b32 v24, s50, 8
+; SI-NEXT:    v_writelane_b32 v24, s51, 9
+; SI-NEXT:    v_writelane_b32 v24, s52, 10
+; SI-NEXT:    v_writelane_b32 v24, s53, 11
+; SI-NEXT:    v_writelane_b32 v24, s54, 12
+; SI-NEXT:    v_writelane_b32 v24, s55, 13
+; SI-NEXT:    v_writelane_b32 v24, s64, 14
+; SI-NEXT:    v_writelane_b32 v24, s65, 15
+; SI-NEXT:    v_writelane_b32 v24, s66, 16
+; SI-NEXT:    v_writelane_b32 v24, s67, 17
+; SI-NEXT:    v_writelane_b32 v24, s68, 18
+; SI-NEXT:    v_writelane_b32 v24, s69, 19
+; SI-NEXT:    v_writelane_b32 v24, s70, 20
+; SI-NEXT:    v_writelane_b32 v24, s71, 21
+; SI-NEXT:    v_writelane_b32 v24, s30, 22
 ; SI-NEXT:    v_readfirstlane_b32 s7, v9
 ; SI-NEXT:    v_readfirstlane_b32 s9, v8
 ; SI-NEXT:    v_readfirstlane_b32 s11, v7
@@ -23749,7 +23749,7 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3
 ; SI-NEXT:    v_readfirstlane_b32 s88, v2
 ; SI-NEXT:    v_readfirstlane_b32 s91, v1
 ; SI-NEXT:    v_readfirstlane_b32 s94, v0
-; SI-NEXT:    v_writelane_b32 v24, s71, 23
+; SI-NEXT:    v_writelane_b32 v24, s31, 23
 ; SI-NEXT:    s_lshr_b32 s72, s29, 16
 ; SI-NEXT:    s_lshr_b32 s75, s28, 16
 ; SI-NEXT:    s_lshr_b32 s78, s27, 16
@@ -23973,6 +23973,7 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_add_i32 s59, s4, 0x30000
 ; SI-NEXT:  .LBB43_3: ; %end
+; SI-NEXT:    v_readlane_b32 s30, v24, 22
 ; SI-NEXT:    v_mov_b32_e32 v0, s36
 ; SI-NEXT:    v_mov_b32_e32 v1, s37
 ; SI-NEXT:    v_mov_b32_e32 v2, s38
@@ -23997,30 +23998,29 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3
 ; SI-NEXT:    v_mov_b32_e32 v21, s57
 ; SI-NEXT:    v_mov_b32_e32 v22, s58
 ; SI-NEXT:    v_mov_b32_e32 v23, s59
-; SI-NEXT:    v_readlane_b32 s71, v24, 23
-; SI-NEXT:    v_readlane_b32 s70, v24, 22
-; SI-NEXT:    v_readlane_b32 s69, v24, 21
-; SI-NEXT:    v_readlane_b32 s68, v24, 20
-; SI-NEXT:    v_readlane_b32 s67, v24, 19
-; SI-NEXT:    v_readlane_b32 s66, v24, 18
-; SI-NEXT:    v_readlane_b32 s65, v24, 17
-; SI-NEXT:    v_readlane_b32 s64, v24, 16
-; SI-NEXT:    v_readlane_b32 s55, v24, 15
-; SI-NEXT:    v_readlane_b32 s54, v24, 14
-; SI-NEXT:    v_readlane_b32 s53, v24, 13
-; SI-NEXT:    v_readlane_b32 s52, v24, 12
-; SI-NEXT:    v_readlane_b32 s51, v24, 11
-; SI-NEXT:    v_readlane_b32 s50, v24, 10
-; SI-NEXT:    v_readlane_b32 s49, v24, 9
-; SI-NEXT:    v_readlane_b32 s48, v24, 8
-; SI-NEXT:    v_readlane_b32 s39, v24, 7
-; SI-NEXT:    v_readlane_b32 s38, v24, 6
-; SI-NEXT:    v_readlane_b32 s37, v24, 5
-; SI-NEXT:    v_readlane_b32 s36, v24, 4
-; SI-NEXT:    v_readlane_b32 s35, v24, 3
-; SI-NEXT:    v_readlane_b32 s34, v24, 2
-; SI-NEXT:    v_readlane_b32 s31, v24, 1
-; SI-NEXT:    v_readlane_b32 s30, v24, 0
+; SI-NEXT:    v_readlane_b32 s31, v24, 23
+; SI-NEXT:    v_readlane_b32 s71, v24, 21
+; SI-NEXT:    v_readlane_b32 s70, v24, 20
+; SI-NEXT:    v_readlane_b32 s69, v24, 19
+; SI-NEXT:    v_readlane_b32 s68, v24, 18
+; SI-NEXT:    v_readlane_b32 s67, v24, 17
+; SI-NEXT:    v_readlane_b32 s66, v24, 16
+; SI-NEXT:    v_readlane_b32 s65, v24, 15
+; SI-NEXT:    v_readlane_b32 s64, v24, 14
+; SI-NEXT:    v_readlane_b32 s55, v24, 13
+; SI-NEXT:    v_readlane_b32 s54, v24, 12
+; SI-NEXT:    v_readlane_b32 s53, v24, 11
+; SI-NEXT:    v_readlane_b32 s52, v24, 10
+; SI-NEXT:    v_readlane_b32 s51, v24, 9
+; SI-NEXT:    v_readlane_b32 s50, v24, 8
+; SI-NEXT:    v_readlane_b32 s49, v24, 7
+; SI-NEXT:    v_readlane_b32 s48, v24, 6
+; SI-NEXT:    v_readlane_b32 s39, v24, 5
+; SI-NEXT:    v_readlane_b32 s38, v24, 4
+; SI-NEXT:    v_readlane_b32 s37, v24, 3
+; SI-NEXT:    v_readlane_b32 s36, v24, 2
+; SI-NEXT:    v_readlane_b32 s35, v24, 1
+; SI-NEXT:    v_readlane_b32 s34, v24, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -24036,33 +24036,33 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v24, s30, 0
-; VI-NEXT:    v_writelane_b32 v24, s31, 1
-; VI-NEXT:    v_writelane_b32 v24, s34, 2
-; VI-NEXT:    v_writelane_b32 v24, s35, 3
-; VI-NEXT:    v_writelane_b32 v24, s36, 4
-; VI-NEXT:    v_writelane_b32 v24, s37, 5
-; VI-NEXT:    v_writelane_b32 v24, s38, 6
-; VI-NEXT:    v_writelane_b32 v24, s39, 7
-; VI-NEXT:    v_writelane_b32 v24, s48, 8
-; VI-NEXT:    v_writelane_b32 v24, s49, 9
-; VI-NEXT:    v_writelane_b32 v24, s50, 10
-; VI-NEXT:    v_writelane_b32 v24, s51, 11
-; VI-NEXT:    v_writelane_b32 v24, s52, 12
-; VI-NEXT:    v_writelane_b32 v24, s53, 13
-; VI-NEXT:    v_writelane_b32 v24, s54, 14
-; VI-NEXT:    v_writelane_b32 v24, s55, 15
-; VI-NEXT:    v_writelane_b32 v24, s64, 16
-; VI-NEXT:    v_writelane_b32 v24, s65, 17
-; VI-NEXT:    v_writelane_b32 v24, s66, 18
-; VI-NEXT:    v_writelane_b32 v24, s67, 19
-; VI-NEXT:    v_writelane_b32 v24, s68, 20
-; VI-NEXT:    v_writelane_b32 v24, s69, 21
-; VI-NEXT:    v_writelane_b32 v24, s70, 22
-; VI-NEXT:    v_writelane_b32 v24, s71, 23
-; VI-NEXT:    v_writelane_b32 v24, s80, 24
-; VI-NEXT:    v_writelane_b32 v24, s81, 25
-; VI-NEXT:    v_writelane_b32 v24, s82, 26
+; VI-NEXT:    v_writelane_b32 v24, s34, 0
+; VI-NEXT:    v_writelane_b32 v24, s35, 1
+; VI-NEXT:    v_writelane_b32 v24, s36, 2
+; VI-NEXT:    v_writelane_b32 v24, s37, 3
+; VI-NEXT:    v_writelane_b32 v24, s38, 4
+; VI-NEXT:    v_writelane_b32 v24, s39, 5
+; VI-NEXT:    v_writelane_b32 v24, s48, 6
+; VI-NEXT:    v_writelane_b32 v24, s49, 7
+; VI-NEXT:    v_writelane_b32 v24, s50, 8
+; VI-NEXT:    v_writelane_b32 v24, s51, 9
+; VI-NEXT:    v_writelane_b32 v24, s52, 10
+; VI-NEXT:    v_writelane_b32 v24, s53, 11
+; VI-NEXT:    v_writelane_b32 v24, s54, 12
+; VI-NEXT:    v_writelane_b32 v24, s55, 13
+; VI-NEXT:    v_writelane_b32 v24, s64, 14
+; VI-NEXT:    v_writelane_b32 v24, s65, 15
+; VI-NEXT:    v_writelane_b32 v24, s66, 16
+; VI-NEXT:    v_writelane_b32 v24, s67, 17
+; VI-NEXT:    v_writelane_b32 v24, s68, 18
+; VI-NEXT:    v_writelane_b32 v24, s69, 19
+; VI-NEXT:    v_writelane_b32 v24, s70, 20
+; VI-NEXT:    v_writelane_b32 v24, s71, 21
+; VI-NEXT:    v_writelane_b32 v24, s80, 22
+; VI-NEXT:    v_writelane_b32 v24, s81, 23
+; VI-NEXT:    v_writelane_b32 v24, s82, 24
+; VI-NEXT:    v_writelane_b32 v24, s83, 25
+; VI-NEXT:    v_writelane_b32 v24, s30, 26
 ; VI-NEXT:    v_readfirstlane_b32 s7, v9
 ; VI-NEXT:    v_readfirstlane_b32 s9, v8
 ; VI-NEXT:    v_readfirstlane_b32 s11, v7
@@ -24073,7 +24073,7 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3
 ; VI-NEXT:    v_readfirstlane_b32 s88, v2
 ; VI-NEXT:    v_readfirstlane_b32 s91, v1
 ; VI-NEXT:    v_readfirstlane_b32 s34, v0
-; VI-NEXT:    v_writelane_b32 v24, s83, 27
+; VI-NEXT:    v_writelane_b32 v24, s31, 27
 ; VI-NEXT:    s_lshr_b32 s72, s29, 16
 ; VI-NEXT:    s_lshr_b32 s75, s28, 16
 ; VI-NEXT:    s_lshr_b32 s78, s27, 16
@@ -24297,6 +24297,7 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3
 ; VI-NEXT:    s_or_b32 s4, s5, s4
 ; VI-NEXT:    s_add_i32 s59, s4, 0x30000
 ; VI-NEXT:  .LBB43_3: ; %end
+; VI-NEXT:    v_readlane_b32 s30, v24, 26
 ; VI-NEXT:    v_mov_b32_e32 v0, s36
 ; VI-NEXT:    v_mov_b32_e32 v1, s37
 ; VI-NEXT:    v_mov_b32_e32 v2, s38
@@ -24321,34 +24322,33 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3
 ; VI-NEXT:    v_mov_b32_e32 v21, s57
 ; VI-NEXT:    v_mov_b32_e32 v22, s58
 ; VI-NEXT:    v_mov_b32_e32 v23, s59
-; VI-NEXT:    v_readlane_b32 s83, v24, 27
-; VI-NEXT:    v_readlane_b32 s82, v24, 26
-; VI-NEXT:    v_readlane_b32 s81, v24, 25
-; VI-NEXT:    v_readlane_b32 s80, v24, 24
-; VI-NEXT:    v_readlane_b32 s71, v24, 23
-; VI-NEXT:    v_readlane_b32 s70, v24, 22
-; VI-NEXT:    v_readlane_b32 s69, v24, 21
-; VI-NEXT:    v_readlane_b32 s68, v24, 20
-; VI-NEXT:    v_readlane_b32 s67, v24, 19
-; VI-NEXT:    v_readlane_b32 s66, v24, 18
-; VI-NEXT:    v_readlane_b32 s65, v24, 17
-; VI-NEXT:    v_readlane_b32 s64, v24, 16
-; VI-NEXT:    v_readlane_b32 s55, v24, 15
-; VI-NEXT:    v_readlane_b32 s54, v24, 14
-; VI-NEXT:    v_readlane_b32 s53, v24, 13
-; VI-NEXT:    v_readlane_b32 s52, v24, 12
-; VI-NEXT:    v_readlane_b32 s51, v24, 11
-; VI-NEXT:    v_readlane_b32 s50, v24, 10
-; VI-NEXT:    v_readlane_b32 s49, v24, 9
-; VI-NEXT:    v_readlane_b32 s48, v24, 8
-; VI-NEXT:    v_readlane_b32 s39, v24, 7
-; VI-NEXT:    v_readlane_b32 s38, v24, 6
-; VI-NEXT:    v_readlane_b32 s37, v24, 5
-; VI-NEXT:    v_readlane_b32 s36, v24, 4
-; VI-NEXT:    v_readlane_b32 s35, v24, 3
-; VI-NEXT:    v_readlane_b32 s34, v24, 2
-; VI-NEXT:    v_readlane_b32 s31, v24, 1
-; VI-NEXT:    v_readlane_b32 s30, v24, 0
+; VI-NEXT:    v_readlane_b32 s31, v24, 27
+; VI-NEXT:    v_readlane_b32 s83, v24, 25
+; VI-NEXT:    v_readlane_b32 s82, v24, 24
+; VI-NEXT:    v_readlane_b32 s81, v24, 23
+; VI-NEXT:    v_readlane_b32 s80, v24, 22
+; VI-NEXT:    v_readlane_b32 s71, v24, 21
+; VI-NEXT:    v_readlane_b32 s70, v24, 20
+; VI-NEXT:    v_readlane_b32 s69, v24, 19
+; VI-NEXT:    v_readlane_b32 s68, v24, 18
+; VI-NEXT:    v_readlane_b32 s67, v24, 17
+; VI-NEXT:    v_readlane_b32 s66, v24, 16
+; VI-NEXT:    v_readlane_b32 s65, v24, 15
+; VI-NEXT:    v_readlane_b32 s64, v24, 14
+; VI-NEXT:    v_readlane_b32 s55, v24, 13
+; VI-NEXT:    v_readlane_b32 s54, v24, 12
+; VI-NEXT:    v_readlane_b32 s53, v24, 11
+; VI-NEXT:    v_readlane_b32 s52, v24, 10
+; VI-NEXT:    v_readlane_b32 s51, v24, 9
+; VI-NEXT:    v_readlane_b32 s50, v24, 8
+; VI-NEXT:    v_readlane_b32 s49, v24, 7
+; VI-NEXT:    v_readlane_b32 s48, v24, 6
+; VI-NEXT:    v_readlane_b32 s39, v24, 5
+; VI-NEXT:    v_readlane_b32 s38, v24, 4
+; VI-NEXT:    v_readlane_b32 s37, v24, 3
+; VI-NEXT:    v_readlane_b32 s36, v24, 2
+; VI-NEXT:    v_readlane_b32 s35, v24, 1
+; VI-NEXT:    v_readlane_b32 s34, v24, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -24646,7 +24646,7 @@ end:
   ret <12 x i64> %phi
 }
 
-define <48 x half> @bitcast_v12i64_to_v48f16(<12 x i64> %a, i32 %b) {
+define <48 x half> @bitcast_v12i64_to_v48f16(<12 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v12i64_to_v48f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -25332,7 +25332,7 @@ end:
   ret <48 x half> %phi
 }
 
-define inreg <48 x half> @bitcast_v12i64_to_v48f16_scalar(<12 x i64> inreg %a, i32 inreg %b) {
+define inreg <48 x half> @bitcast_v12i64_to_v48f16_scalar(<12 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v12i64_to_v48f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -25340,10 +25340,10 @@ define inreg <48 x half> @bitcast_v12i64_to_v48f16_scalar(<12 x i64> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v24, s30, 0
-; SI-NEXT:    v_writelane_b32 v24, s31, 1
+; SI-NEXT:    v_writelane_b32 v24, s34, 0
+; SI-NEXT:    v_writelane_b32 v24, s35, 1
 ; SI-NEXT:    v_readfirstlane_b32 s12, v10
-; SI-NEXT:    v_writelane_b32 v24, s34, 2
+; SI-NEXT:    v_writelane_b32 v24, s30, 2
 ; SI-NEXT:    v_readfirstlane_b32 s5, v9
 ; SI-NEXT:    v_readfirstlane_b32 s4, v8
 ; SI-NEXT:    v_readfirstlane_b32 s7, v7
@@ -25355,7 +25355,7 @@ define inreg <48 x half> @bitcast_v12i64_to_v48f16_scalar(<12 x i64> inreg %a, i
 ; SI-NEXT:    v_readfirstlane_b32 s13, v1
 ; SI-NEXT:    s_cmp_lg_u32 s12, 0
 ; SI-NEXT:    v_readfirstlane_b32 s12, v0
-; SI-NEXT:    v_writelane_b32 v24, s35, 3
+; SI-NEXT:    v_writelane_b32 v24, s31, 3
 ; SI-NEXT:    s_cbranch_scc0 .LBB45_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s88, s5, 16
@@ -25505,6 +25505,7 @@ define inreg <48 x half> @bitcast_v12i64_to_v48f16_scalar(<12 x i64> inreg %a, i
 ; SI-NEXT:    s_lshl_b32 s14, s88, 16
 ; SI-NEXT:    s_or_b32 s7, s7, s29
 ; SI-NEXT:    s_or_b32 s5, s5, s14
+; SI-NEXT:    v_readlane_b32 s30, v24, 2
 ; SI-NEXT:    v_mov_b32_e32 v0, s15
 ; SI-NEXT:    v_mov_b32_e32 v1, s16
 ; SI-NEXT:    v_mov_b32_e32 v2, s17
@@ -25529,10 +25530,9 @@ define inreg <48 x half> @bitcast_v12i64_to_v48f16_scalar(<12 x i64> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v21, s7
 ; SI-NEXT:    v_mov_b32_e32 v22, s4
 ; SI-NEXT:    v_mov_b32_e32 v23, s5
-; SI-NEXT:    v_readlane_b32 s35, v24, 3
-; SI-NEXT:    v_readlane_b32 s34, v24, 2
-; SI-NEXT:    v_readlane_b32 s31, v24, 1
-; SI-NEXT:    v_readlane_b32 s30, v24, 0
+; SI-NEXT:    v_readlane_b32 s31, v24, 3
+; SI-NEXT:    v_readlane_b32 s35, v24, 1
+; SI-NEXT:    v_readlane_b32 s34, v24, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -26120,7 +26120,7 @@ end:
   ret <48 x half> %phi
 }
 
-define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) {
+define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v48f16_to_v12i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -27279,7 +27279,7 @@ end:
   ret <12 x i64> %phi
 }
 
-define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i32 inreg %b) {
+define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v48f16_to_v12i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -27287,29 +27287,29 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v32, s30, 0
-; SI-NEXT:    v_writelane_b32 v32, s31, 1
-; SI-NEXT:    v_writelane_b32 v32, s34, 2
-; SI-NEXT:    v_writelane_b32 v32, s35, 3
-; SI-NEXT:    v_writelane_b32 v32, s36, 4
-; SI-NEXT:    v_writelane_b32 v32, s37, 5
-; SI-NEXT:    v_writelane_b32 v32, s38, 6
-; SI-NEXT:    v_writelane_b32 v32, s39, 7
-; SI-NEXT:    v_writelane_b32 v32, s48, 8
-; SI-NEXT:    v_writelane_b32 v32, s49, 9
-; SI-NEXT:    v_writelane_b32 v32, s50, 10
-; SI-NEXT:    v_writelane_b32 v32, s51, 11
-; SI-NEXT:    v_writelane_b32 v32, s52, 12
-; SI-NEXT:    v_writelane_b32 v32, s53, 13
-; SI-NEXT:    v_writelane_b32 v32, s54, 14
-; SI-NEXT:    v_writelane_b32 v32, s55, 15
-; SI-NEXT:    v_writelane_b32 v32, s64, 16
-; SI-NEXT:    v_writelane_b32 v32, s65, 17
-; SI-NEXT:    v_writelane_b32 v32, s66, 18
-; SI-NEXT:    v_writelane_b32 v32, s67, 19
-; SI-NEXT:    v_writelane_b32 v32, s68, 20
-; SI-NEXT:    v_writelane_b32 v32, s69, 21
-; SI-NEXT:    v_writelane_b32 v32, s70, 22
+; SI-NEXT:    v_writelane_b32 v32, s34, 0
+; SI-NEXT:    v_writelane_b32 v32, s35, 1
+; SI-NEXT:    v_writelane_b32 v32, s36, 2
+; SI-NEXT:    v_writelane_b32 v32, s37, 3
+; SI-NEXT:    v_writelane_b32 v32, s38, 4
+; SI-NEXT:    v_writelane_b32 v32, s39, 5
+; SI-NEXT:    v_writelane_b32 v32, s48, 6
+; SI-NEXT:    v_writelane_b32 v32, s49, 7
+; SI-NEXT:    v_writelane_b32 v32, s50, 8
+; SI-NEXT:    v_writelane_b32 v32, s51, 9
+; SI-NEXT:    v_writelane_b32 v32, s52, 10
+; SI-NEXT:    v_writelane_b32 v32, s53, 11
+; SI-NEXT:    v_writelane_b32 v32, s54, 12
+; SI-NEXT:    v_writelane_b32 v32, s55, 13
+; SI-NEXT:    v_writelane_b32 v32, s64, 14
+; SI-NEXT:    v_writelane_b32 v32, s65, 15
+; SI-NEXT:    v_writelane_b32 v32, s66, 16
+; SI-NEXT:    v_writelane_b32 v32, s67, 17
+; SI-NEXT:    v_writelane_b32 v32, s68, 18
+; SI-NEXT:    v_writelane_b32 v32, s69, 19
+; SI-NEXT:    v_writelane_b32 v32, s70, 20
+; SI-NEXT:    v_writelane_b32 v32, s71, 21
+; SI-NEXT:    v_writelane_b32 v32, s30, 22
 ; SI-NEXT:    v_readfirstlane_b32 s6, v9
 ; SI-NEXT:    v_readfirstlane_b32 s8, v8
 ; SI-NEXT:    v_readfirstlane_b32 s10, v7
@@ -27320,7 +27320,7 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i
 ; SI-NEXT:    v_readfirstlane_b32 s76, v2
 ; SI-NEXT:    v_readfirstlane_b32 s79, v1
 ; SI-NEXT:    v_readfirstlane_b32 s89, v0
-; SI-NEXT:    v_writelane_b32 v32, s71, 23
+; SI-NEXT:    v_writelane_b32 v32, s31, 23
 ; SI-NEXT:    s_lshr_b32 s78, s29, 16
 ; SI-NEXT:    s_lshr_b32 s90, s28, 16
 ; SI-NEXT:    s_lshr_b32 s92, s27, 16
@@ -27653,30 +27653,30 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v30, s66
 ; SI-NEXT:    v_mov_b32_e32 v31, s67
 ; SI-NEXT:  .LBB47_5: ; %end
-; SI-NEXT:    v_readlane_b32 s71, v32, 23
-; SI-NEXT:    v_readlane_b32 s70, v32, 22
-; SI-NEXT:    v_readlane_b32 s69, v32, 21
-; SI-NEXT:    v_readlane_b32 s68, v32, 20
-; SI-NEXT:    v_readlane_b32 s67, v32, 19
-; SI-NEXT:    v_readlane_b32 s66, v32, 18
-; SI-NEXT:    v_readlane_b32 s65, v32, 17
-; SI-NEXT:    v_readlane_b32 s64, v32, 16
-; SI-NEXT:    v_readlane_b32 s55, v32, 15
-; SI-NEXT:    v_readlane_b32 s54, v32, 14
-; SI-NEXT:    v_readlane_b32 s53, v32, 13
-; SI-NEXT:    v_readlane_b32 s52, v32, 12
-; SI-NEXT:    v_readlane_b32 s51, v32, 11
-; SI-NEXT:    v_readlane_b32 s50, v32, 10
-; SI-NEXT:    v_readlane_b32 s49, v32, 9
-; SI-NEXT:    v_readlane_b32 s48, v32, 8
-; SI-NEXT:    v_readlane_b32 s39, v32, 7
-; SI-NEXT:    v_readlane_b32 s38, v32, 6
-; SI-NEXT:    v_readlane_b32 s37, v32, 5
-; SI-NEXT:    v_readlane_b32 s36, v32, 4
-; SI-NEXT:    v_readlane_b32 s35, v32, 3
-; SI-NEXT:    v_readlane_b32 s34, v32, 2
-; SI-NEXT:    v_readlane_b32 s31, v32, 1
-; SI-NEXT:    v_readlane_b32 s30, v32, 0
+; SI-NEXT:    v_readlane_b32 s30, v32, 22
+; SI-NEXT:    v_readlane_b32 s31, v32, 23
+; SI-NEXT:    v_readlane_b32 s71, v32, 21
+; SI-NEXT:    v_readlane_b32 s70, v32, 20
+; SI-NEXT:    v_readlane_b32 s69, v32, 19
+; SI-NEXT:    v_readlane_b32 s68, v32, 18
+; SI-NEXT:    v_readlane_b32 s67, v32, 17
+; SI-NEXT:    v_readlane_b32 s66, v32, 16
+; SI-NEXT:    v_readlane_b32 s65, v32, 15
+; SI-NEXT:    v_readlane_b32 s64, v32, 14
+; SI-NEXT:    v_readlane_b32 s55, v32, 13
+; SI-NEXT:    v_readlane_b32 s54, v32, 12
+; SI-NEXT:    v_readlane_b32 s53, v32, 11
+; SI-NEXT:    v_readlane_b32 s52, v32, 10
+; SI-NEXT:    v_readlane_b32 s51, v32, 9
+; SI-NEXT:    v_readlane_b32 s50, v32, 8
+; SI-NEXT:    v_readlane_b32 s49, v32, 7
+; SI-NEXT:    v_readlane_b32 s48, v32, 6
+; SI-NEXT:    v_readlane_b32 s39, v32, 5
+; SI-NEXT:    v_readlane_b32 s38, v32, 4
+; SI-NEXT:    v_readlane_b32 s37, v32, 3
+; SI-NEXT:    v_readlane_b32 s36, v32, 2
+; SI-NEXT:    v_readlane_b32 s35, v32, 1
+; SI-NEXT:    v_readlane_b32 s34, v32, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -27689,33 +27689,33 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v32, s30, 0
-; VI-NEXT:    v_writelane_b32 v32, s31, 1
-; VI-NEXT:    v_writelane_b32 v32, s34, 2
-; VI-NEXT:    v_writelane_b32 v32, s35, 3
-; VI-NEXT:    v_writelane_b32 v32, s36, 4
-; VI-NEXT:    v_writelane_b32 v32, s37, 5
-; VI-NEXT:    v_writelane_b32 v32, s38, 6
-; VI-NEXT:    v_writelane_b32 v32, s39, 7
-; VI-NEXT:    v_writelane_b32 v32, s48, 8
-; VI-NEXT:    v_writelane_b32 v32, s49, 9
-; VI-NEXT:    v_writelane_b32 v32, s50, 10
-; VI-NEXT:    v_writelane_b32 v32, s51, 11
-; VI-NEXT:    v_writelane_b32 v32, s52, 12
-; VI-NEXT:    v_writelane_b32 v32, s53, 13
-; VI-NEXT:    v_writelane_b32 v32, s54, 14
-; VI-NEXT:    v_writelane_b32 v32, s55, 15
-; VI-NEXT:    v_writelane_b32 v32, s64, 16
-; VI-NEXT:    v_writelane_b32 v32, s65, 17
-; VI-NEXT:    v_writelane_b32 v32, s66, 18
-; VI-NEXT:    v_writelane_b32 v32, s67, 19
-; VI-NEXT:    v_writelane_b32 v32, s68, 20
-; VI-NEXT:    v_writelane_b32 v32, s69, 21
-; VI-NEXT:    v_writelane_b32 v32, s70, 22
-; VI-NEXT:    v_writelane_b32 v32, s71, 23
-; VI-NEXT:    v_writelane_b32 v32, s80, 24
-; VI-NEXT:    v_writelane_b32 v32, s81, 25
-; VI-NEXT:    v_writelane_b32 v32, s82, 26
+; VI-NEXT:    v_writelane_b32 v32, s34, 0
+; VI-NEXT:    v_writelane_b32 v32, s35, 1
+; VI-NEXT:    v_writelane_b32 v32, s36, 2
+; VI-NEXT:    v_writelane_b32 v32, s37, 3
+; VI-NEXT:    v_writelane_b32 v32, s38, 4
+; VI-NEXT:    v_writelane_b32 v32, s39, 5
+; VI-NEXT:    v_writelane_b32 v32, s48, 6
+; VI-NEXT:    v_writelane_b32 v32, s49, 7
+; VI-NEXT:    v_writelane_b32 v32, s50, 8
+; VI-NEXT:    v_writelane_b32 v32, s51, 9
+; VI-NEXT:    v_writelane_b32 v32, s52, 10
+; VI-NEXT:    v_writelane_b32 v32, s53, 11
+; VI-NEXT:    v_writelane_b32 v32, s54, 12
+; VI-NEXT:    v_writelane_b32 v32, s55, 13
+; VI-NEXT:    v_writelane_b32 v32, s64, 14
+; VI-NEXT:    v_writelane_b32 v32, s65, 15
+; VI-NEXT:    v_writelane_b32 v32, s66, 16
+; VI-NEXT:    v_writelane_b32 v32, s67, 17
+; VI-NEXT:    v_writelane_b32 v32, s68, 18
+; VI-NEXT:    v_writelane_b32 v32, s69, 19
+; VI-NEXT:    v_writelane_b32 v32, s70, 20
+; VI-NEXT:    v_writelane_b32 v32, s71, 21
+; VI-NEXT:    v_writelane_b32 v32, s80, 22
+; VI-NEXT:    v_writelane_b32 v32, s81, 23
+; VI-NEXT:    v_writelane_b32 v32, s82, 24
+; VI-NEXT:    v_writelane_b32 v32, s83, 25
+; VI-NEXT:    v_writelane_b32 v32, s30, 26
 ; VI-NEXT:    v_readfirstlane_b32 s6, v9
 ; VI-NEXT:    v_readfirstlane_b32 s8, v8
 ; VI-NEXT:    v_readfirstlane_b32 s10, v7
@@ -27726,7 +27726,7 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i
 ; VI-NEXT:    v_readfirstlane_b32 s79, v2
 ; VI-NEXT:    v_readfirstlane_b32 s91, v1
 ; VI-NEXT:    v_readfirstlane_b32 s34, v0
-; VI-NEXT:    v_writelane_b32 v32, s83, 27
+; VI-NEXT:    v_writelane_b32 v32, s31, 27
 ; VI-NEXT:    s_lshr_b32 s72, s29, 16
 ; VI-NEXT:    s_lshr_b32 s74, s28, 16
 ; VI-NEXT:    s_lshr_b32 s77, s27, 16
@@ -27964,34 +27964,34 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i
 ; VI-NEXT:    v_mov_b32_e32 v30, s66
 ; VI-NEXT:    v_mov_b32_e32 v31, s67
 ; VI-NEXT:  .LBB47_5: ; %end
-; VI-NEXT:    v_readlane_b32 s83, v32, 27
-; VI-NEXT:    v_readlane_b32 s82, v32, 26
-; VI-NEXT:    v_readlane_b32 s81, v32, 25
-; VI-NEXT:    v_readlane_b32 s80, v32, 24
-; VI-NEXT:    v_readlane_b32 s71, v32, 23
-; VI-NEXT:    v_readlane_b32 s70, v32, 22
-; VI-NEXT:    v_readlane_b32 s69, v32, 21
-; VI-NEXT:    v_readlane_b32 s68, v32, 20
-; VI-NEXT:    v_readlane_b32 s67, v32, 19
-; VI-NEXT:    v_readlane_b32 s66, v32, 18
-; VI-NEXT:    v_readlane_b32 s65, v32, 17
-; VI-NEXT:    v_readlane_b32 s64, v32, 16
-; VI-NEXT:    v_readlane_b32 s55, v32, 15
-; VI-NEXT:    v_readlane_b32 s54, v32, 14
-; VI-NEXT:    v_readlane_b32 s53, v32, 13
-; VI-NEXT:    v_readlane_b32 s52, v32, 12
-; VI-NEXT:    v_readlane_b32 s51, v32, 11
-; VI-NEXT:    v_readlane_b32 s50, v32, 10
-; VI-NEXT:    v_readlane_b32 s49, v32, 9
-; VI-NEXT:    v_readlane_b32 s48, v32, 8
-; VI-NEXT:    v_readlane_b32 s39, v32, 7
-; VI-NEXT:    v_readlane_b32 s38, v32, 6
-; VI-NEXT:    v_readlane_b32 s37, v32, 5
-; VI-NEXT:    v_readlane_b32 s36, v32, 4
-; VI-NEXT:    v_readlane_b32 s35, v32, 3
-; VI-NEXT:    v_readlane_b32 s34, v32, 2
-; VI-NEXT:    v_readlane_b32 s31, v32, 1
-; VI-NEXT:    v_readlane_b32 s30, v32, 0
+; VI-NEXT:    v_readlane_b32 s30, v32, 26
+; VI-NEXT:    v_readlane_b32 s31, v32, 27
+; VI-NEXT:    v_readlane_b32 s83, v32, 25
+; VI-NEXT:    v_readlane_b32 s82, v32, 24
+; VI-NEXT:    v_readlane_b32 s81, v32, 23
+; VI-NEXT:    v_readlane_b32 s80, v32, 22
+; VI-NEXT:    v_readlane_b32 s71, v32, 21
+; VI-NEXT:    v_readlane_b32 s70, v32, 20
+; VI-NEXT:    v_readlane_b32 s69, v32, 19
+; VI-NEXT:    v_readlane_b32 s68, v32, 18
+; VI-NEXT:    v_readlane_b32 s67, v32, 17
+; VI-NEXT:    v_readlane_b32 s66, v32, 16
+; VI-NEXT:    v_readlane_b32 s65, v32, 15
+; VI-NEXT:    v_readlane_b32 s64, v32, 14
+; VI-NEXT:    v_readlane_b32 s55, v32, 13
+; VI-NEXT:    v_readlane_b32 s54, v32, 12
+; VI-NEXT:    v_readlane_b32 s53, v32, 11
+; VI-NEXT:    v_readlane_b32 s52, v32, 10
+; VI-NEXT:    v_readlane_b32 s51, v32, 9
+; VI-NEXT:    v_readlane_b32 s50, v32, 8
+; VI-NEXT:    v_readlane_b32 s49, v32, 7
+; VI-NEXT:    v_readlane_b32 s48, v32, 6
+; VI-NEXT:    v_readlane_b32 s39, v32, 5
+; VI-NEXT:    v_readlane_b32 s38, v32, 4
+; VI-NEXT:    v_readlane_b32 s37, v32, 3
+; VI-NEXT:    v_readlane_b32 s36, v32, 2
+; VI-NEXT:    v_readlane_b32 s35, v32, 1
+; VI-NEXT:    v_readlane_b32 s34, v32, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -28287,7 +28287,7 @@ end:
   ret <12 x i64> %phi
 }
 
-define <48 x i16> @bitcast_v12f64_to_v48i16(<12 x double> %a, i32 %b) {
+define <48 x i16> @bitcast_v12f64_to_v48i16(<12 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v12f64_to_v48i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -28901,7 +28901,7 @@ end:
   ret <48 x i16> %phi
 }
 
-define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a, i32 inreg %b) {
+define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v12f64_to_v48i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -28909,10 +28909,10 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a,
 ; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v40, s30, 0
-; SI-NEXT:    v_writelane_b32 v40, s31, 1
+; SI-NEXT:    v_writelane_b32 v40, s34, 0
+; SI-NEXT:    v_writelane_b32 v40, s35, 1
 ; SI-NEXT:    v_readfirstlane_b32 s4, v10
-; SI-NEXT:    v_writelane_b32 v40, s34, 2
+; SI-NEXT:    v_writelane_b32 v40, s30, 2
 ; SI-NEXT:    v_readfirstlane_b32 s13, v9
 ; SI-NEXT:    v_readfirstlane_b32 s12, v8
 ; SI-NEXT:    v_readfirstlane_b32 s11, v7
@@ -28924,7 +28924,7 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a,
 ; SI-NEXT:    v_readfirstlane_b32 s5, v1
 ; SI-NEXT:    s_cmp_lg_u32 s4, 0
 ; SI-NEXT:    v_readfirstlane_b32 s4, v0
-; SI-NEXT:    v_writelane_b32 v40, s35, 3
+; SI-NEXT:    v_writelane_b32 v40, s31, 3
 ; SI-NEXT:    s_cbranch_scc0 .LBB49_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s35, s13, 16
@@ -29126,6 +29126,7 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a,
 ; SI-NEXT:    v_or_b32_e32 v22, v22, v24
 ; SI-NEXT:    v_and_b32_e32 v23, 0xffff, v23
 ; SI-NEXT:    v_lshlrev_b32_e32 v24, 16, v37
+; SI-NEXT:    v_readlane_b32 s30, v40, 2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v35
 ; SI-NEXT:    v_or_b32_e32 v3, v3, v34
 ; SI-NEXT:    v_or_b32_e32 v5, v5, v33
@@ -29138,10 +29139,9 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a,
 ; SI-NEXT:    v_or_b32_e32 v19, v19, v26
 ; SI-NEXT:    v_or_b32_e32 v21, v21, v25
 ; SI-NEXT:    v_or_b32_e32 v23, v23, v24
-; SI-NEXT:    v_readlane_b32 s35, v40, 3
-; SI-NEXT:    v_readlane_b32 s34, v40, 2
-; SI-NEXT:    v_readlane_b32 s31, v40, 1
-; SI-NEXT:    v_readlane_b32 s30, v40, 0
+; SI-NEXT:    v_readlane_b32 s31, v40, 3
+; SI-NEXT:    v_readlane_b32 s35, v40, 1
+; SI-NEXT:    v_readlane_b32 s34, v40, 0
 ; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -29913,7 +29913,7 @@ end:
   ret <48 x i16> %phi
 }
 
-define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) {
+define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v48i16_to_v12f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -30960,7 +30960,7 @@ end:
   ret <12 x double> %phi
 }
 
-define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a, i32 inreg %b) {
+define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v48i16_to_v12f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -30968,29 +30968,29 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a,
 ; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v24, s30, 0
-; SI-NEXT:    v_writelane_b32 v24, s31, 1
-; SI-NEXT:    v_writelane_b32 v24, s34, 2
-; SI-NEXT:    v_writelane_b32 v24, s35, 3
-; SI-NEXT:    v_writelane_b32 v24, s36, 4
-; SI-NEXT:    v_writelane_b32 v24, s37, 5
-; SI-NEXT:    v_writelane_b32 v24, s38, 6
-; SI-NEXT:    v_writelane_b32 v24, s39, 7
-; SI-NEXT:    v_writelane_b32 v24, s48, 8
-; SI-NEXT:    v_writelane_b32 v24, s49, 9
-; SI-NEXT:    v_writelane_b32 v24, s50, 10
-; SI-NEXT:    v_writelane_b32 v24, s51, 11
-; SI-NEXT:    v_writelane_b32 v24, s52, 12
-; SI-NEXT:    v_writelane_b32 v24, s53, 13
-; SI-NEXT:    v_writelane_b32 v24, s54, 14
-; SI-NEXT:    v_writelane_b32 v24, s55, 15
-; SI-NEXT:    v_writelane_b32 v24, s64, 16
-; SI-NEXT:    v_writelane_b32 v24, s65, 17
-; SI-NEXT:    v_writelane_b32 v24, s66, 18
-; SI-NEXT:    v_writelane_b32 v24, s67, 19
-; SI-NEXT:    v_writelane_b32 v24, s68, 20
-; SI-NEXT:    v_writelane_b32 v24, s69, 21
-; SI-NEXT:    v_writelane_b32 v24, s70, 22
+; SI-NEXT:    v_writelane_b32 v24, s34, 0
+; SI-NEXT:    v_writelane_b32 v24, s35, 1
+; SI-NEXT:    v_writelane_b32 v24, s36, 2
+; SI-NEXT:    v_writelane_b32 v24, s37, 3
+; SI-NEXT:    v_writelane_b32 v24, s38, 4
+; SI-NEXT:    v_writelane_b32 v24, s39, 5
+; SI-NEXT:    v_writelane_b32 v24, s48, 6
+; SI-NEXT:    v_writelane_b32 v24, s49, 7
+; SI-NEXT:    v_writelane_b32 v24, s50, 8
+; SI-NEXT:    v_writelane_b32 v24, s51, 9
+; SI-NEXT:    v_writelane_b32 v24, s52, 10
+; SI-NEXT:    v_writelane_b32 v24, s53, 11
+; SI-NEXT:    v_writelane_b32 v24, s54, 12
+; SI-NEXT:    v_writelane_b32 v24, s55, 13
+; SI-NEXT:    v_writelane_b32 v24, s64, 14
+; SI-NEXT:    v_writelane_b32 v24, s65, 15
+; SI-NEXT:    v_writelane_b32 v24, s66, 16
+; SI-NEXT:    v_writelane_b32 v24, s67, 17
+; SI-NEXT:    v_writelane_b32 v24, s68, 18
+; SI-NEXT:    v_writelane_b32 v24, s69, 19
+; SI-NEXT:    v_writelane_b32 v24, s70, 20
+; SI-NEXT:    v_writelane_b32 v24, s71, 21
+; SI-NEXT:    v_writelane_b32 v24, s30, 22
 ; SI-NEXT:    v_readfirstlane_b32 s7, v9
 ; SI-NEXT:    v_readfirstlane_b32 s9, v8
 ; SI-NEXT:    v_readfirstlane_b32 s11, v7
@@ -31001,7 +31001,7 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a,
 ; SI-NEXT:    v_readfirstlane_b32 s88, v2
 ; SI-NEXT:    v_readfirstlane_b32 s91, v1
 ; SI-NEXT:    v_readfirstlane_b32 s94, v0
-; SI-NEXT:    v_writelane_b32 v24, s71, 23
+; SI-NEXT:    v_writelane_b32 v24, s31, 23
 ; SI-NEXT:    s_lshr_b32 s72, s29, 16
 ; SI-NEXT:    s_lshr_b32 s75, s28, 16
 ; SI-NEXT:    s_lshr_b32 s78, s27, 16
@@ -31225,6 +31225,7 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a,
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_add_i32 s59, s4, 0x30000
 ; SI-NEXT:  .LBB51_3: ; %end
+; SI-NEXT:    v_readlane_b32 s30, v24, 22
 ; SI-NEXT:    v_mov_b32_e32 v0, s36
 ; SI-NEXT:    v_mov_b32_e32 v1, s37
 ; SI-NEXT:    v_mov_b32_e32 v2, s38
@@ -31249,30 +31250,29 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a,
 ; SI-NEXT:    v_mov_b32_e32 v21, s57
 ; SI-NEXT:    v_mov_b32_e32 v22, s58
 ; SI-NEXT:    v_mov_b32_e32 v23, s59
-; SI-NEXT:    v_readlane_b32 s71, v24, 23
-; SI-NEXT:    v_readlane_b32 s70, v24, 22
-; SI-NEXT:    v_readlane_b32 s69, v24, 21
-; SI-NEXT:    v_readlane_b32 s68, v24, 20
-; SI-NEXT:    v_readlane_b32 s67, v24, 19
-; SI-NEXT:    v_readlane_b32 s66, v24, 18
-; SI-NEXT:    v_readlane_b32 s65, v24, 17
-; SI-NEXT:    v_readlane_b32 s64, v24, 16
-; SI-NEXT:    v_readlane_b32 s55, v24, 15
-; SI-NEXT:    v_readlane_b32 s54, v24, 14
-; SI-NEXT:    v_readlane_b32 s53, v24, 13
-; SI-NEXT:    v_readlane_b32 s52, v24, 12
-; SI-NEXT:    v_readlane_b32 s51, v24, 11
-; SI-NEXT:    v_readlane_b32 s50, v24, 10
-; SI-NEXT:    v_readlane_b32 s49, v24, 9
-; SI-NEXT:    v_readlane_b32 s48, v24, 8
-; SI-NEXT:    v_readlane_b32 s39, v24, 7
-; SI-NEXT:    v_readlane_b32 s38, v24, 6
-; SI-NEXT:    v_readlane_b32 s37, v24, 5
-; SI-NEXT:    v_readlane_b32 s36, v24, 4
-; SI-NEXT:    v_readlane_b32 s35, v24, 3
-; SI-NEXT:    v_readlane_b32 s34, v24, 2
-; SI-NEXT:    v_readlane_b32 s31, v24, 1
-; SI-NEXT:    v_readlane_b32 s30, v24, 0
+; SI-NEXT:    v_readlane_b32 s31, v24, 23
+; SI-NEXT:    v_readlane_b32 s71, v24, 21
+; SI-NEXT:    v_readlane_b32 s70, v24, 20
+; SI-NEXT:    v_readlane_b32 s69, v24, 19
+; SI-NEXT:    v_readlane_b32 s68, v24, 18
+; SI-NEXT:    v_readlane_b32 s67, v24, 17
+; SI-NEXT:    v_readlane_b32 s66, v24, 16
+; SI-NEXT:    v_readlane_b32 s65, v24, 15
+; SI-NEXT:    v_readlane_b32 s64, v24, 14
+; SI-NEXT:    v_readlane_b32 s55, v24, 13
+; SI-NEXT:    v_readlane_b32 s54, v24, 12
+; SI-NEXT:    v_readlane_b32 s53, v24, 11
+; SI-NEXT:    v_readlane_b32 s52, v24, 10
+; SI-NEXT:    v_readlane_b32 s51, v24, 9
+; SI-NEXT:    v_readlane_b32 s50, v24, 8
+; SI-NEXT:    v_readlane_b32 s49, v24, 7
+; SI-NEXT:    v_readlane_b32 s48, v24, 6
+; SI-NEXT:    v_readlane_b32 s39, v24, 5
+; SI-NEXT:    v_readlane_b32 s38, v24, 4
+; SI-NEXT:    v_readlane_b32 s37, v24, 3
+; SI-NEXT:    v_readlane_b32 s36, v24, 2
+; SI-NEXT:    v_readlane_b32 s35, v24, 1
+; SI-NEXT:    v_readlane_b32 s34, v24, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -31288,33 +31288,33 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a,
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v24, s30, 0
-; VI-NEXT:    v_writelane_b32 v24, s31, 1
-; VI-NEXT:    v_writelane_b32 v24, s34, 2
-; VI-NEXT:    v_writelane_b32 v24, s35, 3
-; VI-NEXT:    v_writelane_b32 v24, s36, 4
-; VI-NEXT:    v_writelane_b32 v24, s37, 5
-; VI-NEXT:    v_writelane_b32 v24, s38, 6
-; VI-NEXT:    v_writelane_b32 v24, s39, 7
-; VI-NEXT:    v_writelane_b32 v24, s48, 8
-; VI-NEXT:    v_writelane_b32 v24, s49, 9
-; VI-NEXT:    v_writelane_b32 v24, s50, 10
-; VI-NEXT:    v_writelane_b32 v24, s51, 11
-; VI-NEXT:    v_writelane_b32 v24, s52, 12
-; VI-NEXT:    v_writelane_b32 v24, s53, 13
-; VI-NEXT:    v_writelane_b32 v24, s54, 14
-; VI-NEXT:    v_writelane_b32 v24, s55, 15
-; VI-NEXT:    v_writelane_b32 v24, s64, 16
-; VI-NEXT:    v_writelane_b32 v24, s65, 17
-; VI-NEXT:    v_writelane_b32 v24, s66, 18
-; VI-NEXT:    v_writelane_b32 v24, s67, 19
-; VI-NEXT:    v_writelane_b32 v24, s68, 20
-; VI-NEXT:    v_writelane_b32 v24, s69, 21
-; VI-NEXT:    v_writelane_b32 v24, s70, 22
-; VI-NEXT:    v_writelane_b32 v24, s71, 23
-; VI-NEXT:    v_writelane_b32 v24, s80, 24
-; VI-NEXT:    v_writelane_b32 v24, s81, 25
-; VI-NEXT:    v_writelane_b32 v24, s82, 26
+; VI-NEXT:    v_writelane_b32 v24, s34, 0
+; VI-NEXT:    v_writelane_b32 v24, s35, 1
+; VI-NEXT:    v_writelane_b32 v24, s36, 2
+; VI-NEXT:    v_writelane_b32 v24, s37, 3
+; VI-NEXT:    v_writelane_b32 v24, s38, 4
+; VI-NEXT:    v_writelane_b32 v24, s39, 5
+; VI-NEXT:    v_writelane_b32 v24, s48, 6
+; VI-NEXT:    v_writelane_b32 v24, s49, 7
+; VI-NEXT:    v_writelane_b32 v24, s50, 8
+; VI-NEXT:    v_writelane_b32 v24, s51, 9
+; VI-NEXT:    v_writelane_b32 v24, s52, 10
+; VI-NEXT:    v_writelane_b32 v24, s53, 11
+; VI-NEXT:    v_writelane_b32 v24, s54, 12
+; VI-NEXT:    v_writelane_b32 v24, s55, 13
+; VI-NEXT:    v_writelane_b32 v24, s64, 14
+; VI-NEXT:    v_writelane_b32 v24, s65, 15
+; VI-NEXT:    v_writelane_b32 v24, s66, 16
+; VI-NEXT:    v_writelane_b32 v24, s67, 17
+; VI-NEXT:    v_writelane_b32 v24, s68, 18
+; VI-NEXT:    v_writelane_b32 v24, s69, 19
+; VI-NEXT:    v_writelane_b32 v24, s70, 20
+; VI-NEXT:    v_writelane_b32 v24, s71, 21
+; VI-NEXT:    v_writelane_b32 v24, s80, 22
+; VI-NEXT:    v_writelane_b32 v24, s81, 23
+; VI-NEXT:    v_writelane_b32 v24, s82, 24
+; VI-NEXT:    v_writelane_b32 v24, s83, 25
+; VI-NEXT:    v_writelane_b32 v24, s30, 26
 ; VI-NEXT:    v_readfirstlane_b32 s7, v9
 ; VI-NEXT:    v_readfirstlane_b32 s9, v8
 ; VI-NEXT:    v_readfirstlane_b32 s11, v7
@@ -31325,7 +31325,7 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a,
 ; VI-NEXT:    v_readfirstlane_b32 s88, v2
 ; VI-NEXT:    v_readfirstlane_b32 s91, v1
 ; VI-NEXT:    v_readfirstlane_b32 s34, v0
-; VI-NEXT:    v_writelane_b32 v24, s83, 27
+; VI-NEXT:    v_writelane_b32 v24, s31, 27
 ; VI-NEXT:    s_lshr_b32 s72, s29, 16
 ; VI-NEXT:    s_lshr_b32 s75, s28, 16
 ; VI-NEXT:    s_lshr_b32 s78, s27, 16
@@ -31549,6 +31549,7 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a,
 ; VI-NEXT:    s_or_b32 s4, s5, s4
 ; VI-NEXT:    s_add_i32 s59, s4, 0x30000
 ; VI-NEXT:  .LBB51_3: ; %end
+; VI-NEXT:    v_readlane_b32 s30, v24, 26
 ; VI-NEXT:    v_mov_b32_e32 v0, s36
 ; VI-NEXT:    v_mov_b32_e32 v1, s37
 ; VI-NEXT:    v_mov_b32_e32 v2, s38
@@ -31573,34 +31574,33 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a,
 ; VI-NEXT:    v_mov_b32_e32 v21, s57
 ; VI-NEXT:    v_mov_b32_e32 v22, s58
 ; VI-NEXT:    v_mov_b32_e32 v23, s59
-; VI-NEXT:    v_readlane_b32 s83, v24, 27
-; VI-NEXT:    v_readlane_b32 s82, v24, 26
-; VI-NEXT:    v_readlane_b32 s81, v24, 25
-; VI-NEXT:    v_readlane_b32 s80, v24, 24
-; VI-NEXT:    v_readlane_b32 s71, v24, 23
-; VI-NEXT:    v_readlane_b32 s70, v24, 22
-; VI-NEXT:    v_readlane_b32 s69, v24, 21
-; VI-NEXT:    v_readlane_b32 s68, v24, 20
-; VI-NEXT:    v_readlane_b32 s67, v24, 19
-; VI-NEXT:    v_readlane_b32 s66, v24, 18
-; VI-NEXT:    v_readlane_b32 s65, v24, 17
-; VI-NEXT:    v_readlane_b32 s64, v24, 16
-; VI-NEXT:    v_readlane_b32 s55, v24, 15
-; VI-NEXT:    v_readlane_b32 s54, v24, 14
-; VI-NEXT:    v_readlane_b32 s53, v24, 13
-; VI-NEXT:    v_readlane_b32 s52, v24, 12
-; VI-NEXT:    v_readlane_b32 s51, v24, 11
-; VI-NEXT:    v_readlane_b32 s50, v24, 10
-; VI-NEXT:    v_readlane_b32 s49, v24, 9
-; VI-NEXT:    v_readlane_b32 s48, v24, 8
-; VI-NEXT:    v_readlane_b32 s39, v24, 7
-; VI-NEXT:    v_readlane_b32 s38, v24, 6
-; VI-NEXT:    v_readlane_b32 s37, v24, 5
-; VI-NEXT:    v_readlane_b32 s36, v24, 4
-; VI-NEXT:    v_readlane_b32 s35, v24, 3
-; VI-NEXT:    v_readlane_b32 s34, v24, 2
-; VI-NEXT:    v_readlane_b32 s31, v24, 1
-; VI-NEXT:    v_readlane_b32 s30, v24, 0
+; VI-NEXT:    v_readlane_b32 s31, v24, 27
+; VI-NEXT:    v_readlane_b32 s83, v24, 25
+; VI-NEXT:    v_readlane_b32 s82, v24, 24
+; VI-NEXT:    v_readlane_b32 s81, v24, 23
+; VI-NEXT:    v_readlane_b32 s80, v24, 22
+; VI-NEXT:    v_readlane_b32 s71, v24, 21
+; VI-NEXT:    v_readlane_b32 s70, v24, 20
+; VI-NEXT:    v_readlane_b32 s69, v24, 19
+; VI-NEXT:    v_readlane_b32 s68, v24, 18
+; VI-NEXT:    v_readlane_b32 s67, v24, 17
+; VI-NEXT:    v_readlane_b32 s66, v24, 16
+; VI-NEXT:    v_readlane_b32 s65, v24, 15
+; VI-NEXT:    v_readlane_b32 s64, v24, 14
+; VI-NEXT:    v_readlane_b32 s55, v24, 13
+; VI-NEXT:    v_readlane_b32 s54, v24, 12
+; VI-NEXT:    v_readlane_b32 s53, v24, 11
+; VI-NEXT:    v_readlane_b32 s52, v24, 10
+; VI-NEXT:    v_readlane_b32 s51, v24, 9
+; VI-NEXT:    v_readlane_b32 s50, v24, 8
+; VI-NEXT:    v_readlane_b32 s49, v24, 7
+; VI-NEXT:    v_readlane_b32 s48, v24, 6
+; VI-NEXT:    v_readlane_b32 s39, v24, 5
+; VI-NEXT:    v_readlane_b32 s38, v24, 4
+; VI-NEXT:    v_readlane_b32 s37, v24, 3
+; VI-NEXT:    v_readlane_b32 s36, v24, 2
+; VI-NEXT:    v_readlane_b32 s35, v24, 1
+; VI-NEXT:    v_readlane_b32 s34, v24, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -31898,7 +31898,7 @@ end:
   ret <12 x double> %phi
 }
 
-define <48 x half> @bitcast_v12f64_to_v48f16(<12 x double> %a, i32 %b) {
+define <48 x half> @bitcast_v12f64_to_v48f16(<12 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v12f64_to_v48f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -32512,7 +32512,7 @@ end:
   ret <48 x half> %phi
 }
 
-define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a, i32 inreg %b) {
+define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v12f64_to_v48f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -32520,10 +32520,10 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a
 ; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v40, s30, 0
-; SI-NEXT:    v_writelane_b32 v40, s31, 1
+; SI-NEXT:    v_writelane_b32 v40, s34, 0
+; SI-NEXT:    v_writelane_b32 v40, s35, 1
 ; SI-NEXT:    v_readfirstlane_b32 s4, v10
-; SI-NEXT:    v_writelane_b32 v40, s34, 2
+; SI-NEXT:    v_writelane_b32 v40, s30, 2
 ; SI-NEXT:    v_readfirstlane_b32 s13, v9
 ; SI-NEXT:    v_readfirstlane_b32 s12, v8
 ; SI-NEXT:    v_readfirstlane_b32 s11, v7
@@ -32535,7 +32535,7 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a
 ; SI-NEXT:    v_readfirstlane_b32 s5, v1
 ; SI-NEXT:    s_cmp_lg_u32 s4, 0
 ; SI-NEXT:    v_readfirstlane_b32 s4, v0
-; SI-NEXT:    v_writelane_b32 v40, s35, 3
+; SI-NEXT:    v_writelane_b32 v40, s31, 3
 ; SI-NEXT:    s_cbranch_scc0 .LBB53_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s35, s13, 16
@@ -32737,6 +32737,7 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a
 ; SI-NEXT:    v_or_b32_e32 v22, v22, v24
 ; SI-NEXT:    v_and_b32_e32 v23, 0xffff, v23
 ; SI-NEXT:    v_lshlrev_b32_e32 v24, 16, v37
+; SI-NEXT:    v_readlane_b32 s30, v40, 2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v35
 ; SI-NEXT:    v_or_b32_e32 v3, v3, v34
 ; SI-NEXT:    v_or_b32_e32 v5, v5, v33
@@ -32749,10 +32750,9 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a
 ; SI-NEXT:    v_or_b32_e32 v19, v19, v26
 ; SI-NEXT:    v_or_b32_e32 v21, v21, v25
 ; SI-NEXT:    v_or_b32_e32 v23, v23, v24
-; SI-NEXT:    v_readlane_b32 s35, v40, 3
-; SI-NEXT:    v_readlane_b32 s34, v40, 2
-; SI-NEXT:    v_readlane_b32 s31, v40, 1
-; SI-NEXT:    v_readlane_b32 s30, v40, 0
+; SI-NEXT:    v_readlane_b32 s31, v40, 3
+; SI-NEXT:    v_readlane_b32 s35, v40, 1
+; SI-NEXT:    v_readlane_b32 s34, v40, 0
 ; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -33524,7 +33524,7 @@ end:
   ret <48 x half> %phi
 }
 
-define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) {
+define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v48f16_to_v12f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -34683,7 +34683,7 @@ end:
   ret <12 x double> %phi
 }
 
-define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a, i32 inreg %b) {
+define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v48f16_to_v12f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -34691,29 +34691,29 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a
 ; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v32, s30, 0
-; SI-NEXT:    v_writelane_b32 v32, s31, 1
-; SI-NEXT:    v_writelane_b32 v32, s34, 2
-; SI-NEXT:    v_writelane_b32 v32, s35, 3
-; SI-NEXT:    v_writelane_b32 v32, s36, 4
-; SI-NEXT:    v_writelane_b32 v32, s37, 5
-; SI-NEXT:    v_writelane_b32 v32, s38, 6
-; SI-NEXT:    v_writelane_b32 v32, s39, 7
-; SI-NEXT:    v_writelane_b32 v32, s48, 8
-; SI-NEXT:    v_writelane_b32 v32, s49, 9
-; SI-NEXT:    v_writelane_b32 v32, s50, 10
-; SI-NEXT:    v_writelane_b32 v32, s51, 11
-; SI-NEXT:    v_writelane_b32 v32, s52, 12
-; SI-NEXT:    v_writelane_b32 v32, s53, 13
-; SI-NEXT:    v_writelane_b32 v32, s54, 14
-; SI-NEXT:    v_writelane_b32 v32, s55, 15
-; SI-NEXT:    v_writelane_b32 v32, s64, 16
-; SI-NEXT:    v_writelane_b32 v32, s65, 17
-; SI-NEXT:    v_writelane_b32 v32, s66, 18
-; SI-NEXT:    v_writelane_b32 v32, s67, 19
-; SI-NEXT:    v_writelane_b32 v32, s68, 20
-; SI-NEXT:    v_writelane_b32 v32, s69, 21
-; SI-NEXT:    v_writelane_b32 v32, s70, 22
+; SI-NEXT:    v_writelane_b32 v32, s34, 0
+; SI-NEXT:    v_writelane_b32 v32, s35, 1
+; SI-NEXT:    v_writelane_b32 v32, s36, 2
+; SI-NEXT:    v_writelane_b32 v32, s37, 3
+; SI-NEXT:    v_writelane_b32 v32, s38, 4
+; SI-NEXT:    v_writelane_b32 v32, s39, 5
+; SI-NEXT:    v_writelane_b32 v32, s48, 6
+; SI-NEXT:    v_writelane_b32 v32, s49, 7
+; SI-NEXT:    v_writelane_b32 v32, s50, 8
+; SI-NEXT:    v_writelane_b32 v32, s51, 9
+; SI-NEXT:    v_writelane_b32 v32, s52, 10
+; SI-NEXT:    v_writelane_b32 v32, s53, 11
+; SI-NEXT:    v_writelane_b32 v32, s54, 12
+; SI-NEXT:    v_writelane_b32 v32, s55, 13
+; SI-NEXT:    v_writelane_b32 v32, s64, 14
+; SI-NEXT:    v_writelane_b32 v32, s65, 15
+; SI-NEXT:    v_writelane_b32 v32, s66, 16
+; SI-NEXT:    v_writelane_b32 v32, s67, 17
+; SI-NEXT:    v_writelane_b32 v32, s68, 18
+; SI-NEXT:    v_writelane_b32 v32, s69, 19
+; SI-NEXT:    v_writelane_b32 v32, s70, 20
+; SI-NEXT:    v_writelane_b32 v32, s71, 21
+; SI-NEXT:    v_writelane_b32 v32, s30, 22
 ; SI-NEXT:    v_readfirstlane_b32 s6, v9
 ; SI-NEXT:    v_readfirstlane_b32 s8, v8
 ; SI-NEXT:    v_readfirstlane_b32 s10, v7
@@ -34724,7 +34724,7 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a
 ; SI-NEXT:    v_readfirstlane_b32 s76, v2
 ; SI-NEXT:    v_readfirstlane_b32 s79, v1
 ; SI-NEXT:    v_readfirstlane_b32 s89, v0
-; SI-NEXT:    v_writelane_b32 v32, s71, 23
+; SI-NEXT:    v_writelane_b32 v32, s31, 23
 ; SI-NEXT:    s_lshr_b32 s78, s29, 16
 ; SI-NEXT:    s_lshr_b32 s90, s28, 16
 ; SI-NEXT:    s_lshr_b32 s92, s27, 16
@@ -35057,30 +35057,30 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a
 ; SI-NEXT:    v_mov_b32_e32 v30, s66
 ; SI-NEXT:    v_mov_b32_e32 v31, s67
 ; SI-NEXT:  .LBB55_5: ; %end
-; SI-NEXT:    v_readlane_b32 s71, v32, 23
-; SI-NEXT:    v_readlane_b32 s70, v32, 22
-; SI-NEXT:    v_readlane_b32 s69, v32, 21
-; SI-NEXT:    v_readlane_b32 s68, v32, 20
-; SI-NEXT:    v_readlane_b32 s67, v32, 19
-; SI-NEXT:    v_readlane_b32 s66, v32, 18
-; SI-NEXT:    v_readlane_b32 s65, v32, 17
-; SI-NEXT:    v_readlane_b32 s64, v32, 16
-; SI-NEXT:    v_readlane_b32 s55, v32, 15
-; SI-NEXT:    v_readlane_b32 s54, v32, 14
-; SI-NEXT:    v_readlane_b32 s53, v32, 13
-; SI-NEXT:    v_readlane_b32 s52, v32, 12
-; SI-NEXT:    v_readlane_b32 s51, v32, 11
-; SI-NEXT:    v_readlane_b32 s50, v32, 10
-; SI-NEXT:    v_readlane_b32 s49, v32, 9
-; SI-NEXT:    v_readlane_b32 s48, v32, 8
-; SI-NEXT:    v_readlane_b32 s39, v32, 7
-; SI-NEXT:    v_readlane_b32 s38, v32, 6
-; SI-NEXT:    v_readlane_b32 s37, v32, 5
-; SI-NEXT:    v_readlane_b32 s36, v32, 4
-; SI-NEXT:    v_readlane_b32 s35, v32, 3
-; SI-NEXT:    v_readlane_b32 s34, v32, 2
-; SI-NEXT:    v_readlane_b32 s31, v32, 1
-; SI-NEXT:    v_readlane_b32 s30, v32, 0
+; SI-NEXT:    v_readlane_b32 s30, v32, 22
+; SI-NEXT:    v_readlane_b32 s31, v32, 23
+; SI-NEXT:    v_readlane_b32 s71, v32, 21
+; SI-NEXT:    v_readlane_b32 s70, v32, 20
+; SI-NEXT:    v_readlane_b32 s69, v32, 19
+; SI-NEXT:    v_readlane_b32 s68, v32, 18
+; SI-NEXT:    v_readlane_b32 s67, v32, 17
+; SI-NEXT:    v_readlane_b32 s66, v32, 16
+; SI-NEXT:    v_readlane_b32 s65, v32, 15
+; SI-NEXT:    v_readlane_b32 s64, v32, 14
+; SI-NEXT:    v_readlane_b32 s55, v32, 13
+; SI-NEXT:    v_readlane_b32 s54, v32, 12
+; SI-NEXT:    v_readlane_b32 s53, v32, 11
+; SI-NEXT:    v_readlane_b32 s52, v32, 10
+; SI-NEXT:    v_readlane_b32 s51, v32, 9
+; SI-NEXT:    v_readlane_b32 s50, v32, 8
+; SI-NEXT:    v_readlane_b32 s49, v32, 7
+; SI-NEXT:    v_readlane_b32 s48, v32, 6
+; SI-NEXT:    v_readlane_b32 s39, v32, 5
+; SI-NEXT:    v_readlane_b32 s38, v32, 4
+; SI-NEXT:    v_readlane_b32 s37, v32, 3
+; SI-NEXT:    v_readlane_b32 s36, v32, 2
+; SI-NEXT:    v_readlane_b32 s35, v32, 1
+; SI-NEXT:    v_readlane_b32 s34, v32, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -35093,33 +35093,33 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v32, s30, 0
-; VI-NEXT:    v_writelane_b32 v32, s31, 1
-; VI-NEXT:    v_writelane_b32 v32, s34, 2
-; VI-NEXT:    v_writelane_b32 v32, s35, 3
-; VI-NEXT:    v_writelane_b32 v32, s36, 4
-; VI-NEXT:    v_writelane_b32 v32, s37, 5
-; VI-NEXT:    v_writelane_b32 v32, s38, 6
-; VI-NEXT:    v_writelane_b32 v32, s39, 7
-; VI-NEXT:    v_writelane_b32 v32, s48, 8
-; VI-NEXT:    v_writelane_b32 v32, s49, 9
-; VI-NEXT:    v_writelane_b32 v32, s50, 10
-; VI-NEXT:    v_writelane_b32 v32, s51, 11
-; VI-NEXT:    v_writelane_b32 v32, s52, 12
-; VI-NEXT:    v_writelane_b32 v32, s53, 13
-; VI-NEXT:    v_writelane_b32 v32, s54, 14
-; VI-NEXT:    v_writelane_b32 v32, s55, 15
-; VI-NEXT:    v_writelane_b32 v32, s64, 16
-; VI-NEXT:    v_writelane_b32 v32, s65, 17
-; VI-NEXT:    v_writelane_b32 v32, s66, 18
-; VI-NEXT:    v_writelane_b32 v32, s67, 19
-; VI-NEXT:    v_writelane_b32 v32, s68, 20
-; VI-NEXT:    v_writelane_b32 v32, s69, 21
-; VI-NEXT:    v_writelane_b32 v32, s70, 22
-; VI-NEXT:    v_writelane_b32 v32, s71, 23
-; VI-NEXT:    v_writelane_b32 v32, s80, 24
-; VI-NEXT:    v_writelane_b32 v32, s81, 25
-; VI-NEXT:    v_writelane_b32 v32, s82, 26
+; VI-NEXT:    v_writelane_b32 v32, s34, 0
+; VI-NEXT:    v_writelane_b32 v32, s35, 1
+; VI-NEXT:    v_writelane_b32 v32, s36, 2
+; VI-NEXT:    v_writelane_b32 v32, s37, 3
+; VI-NEXT:    v_writelane_b32 v32, s38, 4
+; VI-NEXT:    v_writelane_b32 v32, s39, 5
+; VI-NEXT:    v_writelane_b32 v32, s48, 6
+; VI-NEXT:    v_writelane_b32 v32, s49, 7
+; VI-NEXT:    v_writelane_b32 v32, s50, 8
+; VI-NEXT:    v_writelane_b32 v32, s51, 9
+; VI-NEXT:    v_writelane_b32 v32, s52, 10
+; VI-NEXT:    v_writelane_b32 v32, s53, 11
+; VI-NEXT:    v_writelane_b32 v32, s54, 12
+; VI-NEXT:    v_writelane_b32 v32, s55, 13
+; VI-NEXT:    v_writelane_b32 v32, s64, 14
+; VI-NEXT:    v_writelane_b32 v32, s65, 15
+; VI-NEXT:    v_writelane_b32 v32, s66, 16
+; VI-NEXT:    v_writelane_b32 v32, s67, 17
+; VI-NEXT:    v_writelane_b32 v32, s68, 18
+; VI-NEXT:    v_writelane_b32 v32, s69, 19
+; VI-NEXT:    v_writelane_b32 v32, s70, 20
+; VI-NEXT:    v_writelane_b32 v32, s71, 21
+; VI-NEXT:    v_writelane_b32 v32, s80, 22
+; VI-NEXT:    v_writelane_b32 v32, s81, 23
+; VI-NEXT:    v_writelane_b32 v32, s82, 24
+; VI-NEXT:    v_writelane_b32 v32, s83, 25
+; VI-NEXT:    v_writelane_b32 v32, s30, 26
 ; VI-NEXT:    v_readfirstlane_b32 s6, v9
 ; VI-NEXT:    v_readfirstlane_b32 s8, v8
 ; VI-NEXT:    v_readfirstlane_b32 s10, v7
@@ -35130,7 +35130,7 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a
 ; VI-NEXT:    v_readfirstlane_b32 s79, v2
 ; VI-NEXT:    v_readfirstlane_b32 s91, v1
 ; VI-NEXT:    v_readfirstlane_b32 s34, v0
-; VI-NEXT:    v_writelane_b32 v32, s83, 27
+; VI-NEXT:    v_writelane_b32 v32, s31, 27
 ; VI-NEXT:    s_lshr_b32 s72, s29, 16
 ; VI-NEXT:    s_lshr_b32 s74, s28, 16
 ; VI-NEXT:    s_lshr_b32 s77, s27, 16
@@ -35368,34 +35368,34 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a
 ; VI-NEXT:    v_mov_b32_e32 v30, s66
 ; VI-NEXT:    v_mov_b32_e32 v31, s67
 ; VI-NEXT:  .LBB55_5: ; %end
-; VI-NEXT:    v_readlane_b32 s83, v32, 27
-; VI-NEXT:    v_readlane_b32 s82, v32, 26
-; VI-NEXT:    v_readlane_b32 s81, v32, 25
-; VI-NEXT:    v_readlane_b32 s80, v32, 24
-; VI-NEXT:    v_readlane_b32 s71, v32, 23
-; VI-NEXT:    v_readlane_b32 s70, v32, 22
-; VI-NEXT:    v_readlane_b32 s69, v32, 21
-; VI-NEXT:    v_readlane_b32 s68, v32, 20
-; VI-NEXT:    v_readlane_b32 s67, v32, 19
-; VI-NEXT:    v_readlane_b32 s66, v32, 18
-; VI-NEXT:    v_readlane_b32 s65, v32, 17
-; VI-NEXT:    v_readlane_b32 s64, v32, 16
-; VI-NEXT:    v_readlane_b32 s55, v32, 15
-; VI-NEXT:    v_readlane_b32 s54, v32, 14
-; VI-NEXT:    v_readlane_b32 s53, v32, 13
-; VI-NEXT:    v_readlane_b32 s52, v32, 12
-; VI-NEXT:    v_readlane_b32 s51, v32, 11
-; VI-NEXT:    v_readlane_b32 s50, v32, 10
-; VI-NEXT:    v_readlane_b32 s49, v32, 9
-; VI-NEXT:    v_readlane_b32 s48, v32, 8
-; VI-NEXT:    v_readlane_b32 s39, v32, 7
-; VI-NEXT:    v_readlane_b32 s38, v32, 6
-; VI-NEXT:    v_readlane_b32 s37, v32, 5
-; VI-NEXT:    v_readlane_b32 s36, v32, 4
-; VI-NEXT:    v_readlane_b32 s35, v32, 3
-; VI-NEXT:    v_readlane_b32 s34, v32, 2
-; VI-NEXT:    v_readlane_b32 s31, v32, 1
-; VI-NEXT:    v_readlane_b32 s30, v32, 0
+; VI-NEXT:    v_readlane_b32 s30, v32, 26
+; VI-NEXT:    v_readlane_b32 s31, v32, 27
+; VI-NEXT:    v_readlane_b32 s83, v32, 25
+; VI-NEXT:    v_readlane_b32 s82, v32, 24
+; VI-NEXT:    v_readlane_b32 s81, v32, 23
+; VI-NEXT:    v_readlane_b32 s80, v32, 22
+; VI-NEXT:    v_readlane_b32 s71, v32, 21
+; VI-NEXT:    v_readlane_b32 s70, v32, 20
+; VI-NEXT:    v_readlane_b32 s69, v32, 19
+; VI-NEXT:    v_readlane_b32 s68, v32, 18
+; VI-NEXT:    v_readlane_b32 s67, v32, 17
+; VI-NEXT:    v_readlane_b32 s66, v32, 16
+; VI-NEXT:    v_readlane_b32 s65, v32, 15
+; VI-NEXT:    v_readlane_b32 s64, v32, 14
+; VI-NEXT:    v_readlane_b32 s55, v32, 13
+; VI-NEXT:    v_readlane_b32 s54, v32, 12
+; VI-NEXT:    v_readlane_b32 s53, v32, 11
+; VI-NEXT:    v_readlane_b32 s52, v32, 10
+; VI-NEXT:    v_readlane_b32 s51, v32, 9
+; VI-NEXT:    v_readlane_b32 s50, v32, 8
+; VI-NEXT:    v_readlane_b32 s49, v32, 7
+; VI-NEXT:    v_readlane_b32 s48, v32, 6
+; VI-NEXT:    v_readlane_b32 s39, v32, 5
+; VI-NEXT:    v_readlane_b32 s38, v32, 4
+; VI-NEXT:    v_readlane_b32 s37, v32, 3
+; VI-NEXT:    v_readlane_b32 s36, v32, 2
+; VI-NEXT:    v_readlane_b32 s35, v32, 1
+; VI-NEXT:    v_readlane_b32 s34, v32, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -35691,7 +35691,7 @@ end:
   ret <12 x double> %phi
 }
 
-define <48 x half> @bitcast_v48i16_to_v48f16(<48 x i16> %a, i32 %b) {
+define <48 x half> @bitcast_v48i16_to_v48f16(<48 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v48i16_to_v48f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -36786,7 +36786,7 @@ end:
   ret <48 x half> %phi
 }
 
-define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i32 inreg %b) {
+define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v48i16_to_v48f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -36794,42 +36794,40 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v24, s30, 0
-; SI-NEXT:    v_writelane_b32 v24, s31, 1
-; SI-NEXT:    v_writelane_b32 v24, s34, 2
-; SI-NEXT:    v_writelane_b32 v24, s35, 3
-; SI-NEXT:    v_writelane_b32 v24, s36, 4
-; SI-NEXT:    v_writelane_b32 v24, s37, 5
-; SI-NEXT:    v_writelane_b32 v24, s38, 6
-; SI-NEXT:    v_writelane_b32 v24, s39, 7
-; SI-NEXT:    v_writelane_b32 v24, s48, 8
-; SI-NEXT:    v_writelane_b32 v24, s49, 9
-; SI-NEXT:    v_writelane_b32 v24, s50, 10
-; SI-NEXT:    v_writelane_b32 v24, s51, 11
-; SI-NEXT:    v_writelane_b32 v24, s52, 12
-; SI-NEXT:    v_writelane_b32 v24, s53, 13
-; SI-NEXT:    v_writelane_b32 v24, s54, 14
-; SI-NEXT:    v_writelane_b32 v24, s55, 15
-; SI-NEXT:    v_writelane_b32 v24, s64, 16
-; SI-NEXT:    v_writelane_b32 v24, s65, 17
-; SI-NEXT:    v_writelane_b32 v24, s66, 18
-; SI-NEXT:    v_writelane_b32 v24, s67, 19
-; SI-NEXT:    v_writelane_b32 v24, s68, 20
-; SI-NEXT:    v_writelane_b32 v24, s69, 21
-; SI-NEXT:    v_writelane_b32 v24, s70, 22
-; SI-NEXT:    v_writelane_b32 v24, s71, 23
-; SI-NEXT:    v_writelane_b32 v24, s80, 24
-; SI-NEXT:    v_writelane_b32 v24, s81, 25
-; SI-NEXT:    v_writelane_b32 v24, s82, 26
-; SI-NEXT:    v_writelane_b32 v24, s83, 27
-; SI-NEXT:    v_writelane_b32 v24, s84, 28
-; SI-NEXT:    v_writelane_b32 v24, s85, 29
-; SI-NEXT:    v_writelane_b32 v24, s86, 30
-; SI-NEXT:    v_writelane_b32 v24, s87, 31
-; SI-NEXT:    v_writelane_b32 v24, s96, 32
-; SI-NEXT:    v_writelane_b32 v24, s97, 33
-; SI-NEXT:    v_writelane_b32 v24, s98, 34
-; SI-NEXT:    v_writelane_b32 v24, s99, 35
+; SI-NEXT:    v_writelane_b32 v24, s34, 0
+; SI-NEXT:    v_writelane_b32 v24, s35, 1
+; SI-NEXT:    v_writelane_b32 v24, s36, 2
+; SI-NEXT:    v_writelane_b32 v24, s37, 3
+; SI-NEXT:    v_writelane_b32 v24, s38, 4
+; SI-NEXT:    v_writelane_b32 v24, s39, 5
+; SI-NEXT:    v_writelane_b32 v24, s48, 6
+; SI-NEXT:    v_writelane_b32 v24, s49, 7
+; SI-NEXT:    v_writelane_b32 v24, s50, 8
+; SI-NEXT:    v_writelane_b32 v24, s51, 9
+; SI-NEXT:    v_writelane_b32 v24, s52, 10
+; SI-NEXT:    v_writelane_b32 v24, s53, 11
+; SI-NEXT:    v_writelane_b32 v24, s54, 12
+; SI-NEXT:    v_writelane_b32 v24, s55, 13
+; SI-NEXT:    v_writelane_b32 v24, s64, 14
+; SI-NEXT:    v_writelane_b32 v24, s65, 15
+; SI-NEXT:    v_writelane_b32 v24, s66, 16
+; SI-NEXT:    v_writelane_b32 v24, s67, 17
+; SI-NEXT:    v_writelane_b32 v24, s68, 18
+; SI-NEXT:    v_writelane_b32 v24, s69, 19
+; SI-NEXT:    v_writelane_b32 v24, s70, 20
+; SI-NEXT:    v_writelane_b32 v24, s71, 21
+; SI-NEXT:    v_writelane_b32 v24, s80, 22
+; SI-NEXT:    v_writelane_b32 v24, s81, 23
+; SI-NEXT:    v_writelane_b32 v24, s82, 24
+; SI-NEXT:    v_writelane_b32 v24, s83, 25
+; SI-NEXT:    v_writelane_b32 v24, s84, 26
+; SI-NEXT:    v_writelane_b32 v24, s85, 27
+; SI-NEXT:    v_writelane_b32 v24, s86, 28
+; SI-NEXT:    v_writelane_b32 v24, s87, 29
+; SI-NEXT:    v_writelane_b32 v24, s96, 30
+; SI-NEXT:    v_writelane_b32 v24, s97, 31
+; SI-NEXT:    v_writelane_b32 v24, s98, 32
+; SI-NEXT:    v_writelane_b32 v24, s99, 33
 ; SI-NEXT:    v_readfirstlane_b32 s99, v9
 ; SI-NEXT:    v_readfirstlane_b32 s65, v8
 ; SI-NEXT:    v_readfirstlane_b32 s96, v7
@@ -36865,7 +36863,9 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i
 ; SI-NEXT:    s_lshr_b32 s51, s71, 16
 ; SI-NEXT:    s_lshr_b32 s80, s81, 16
 ; SI-NEXT:    v_readfirstlane_b32 s4, v10
+; SI-NEXT:    v_writelane_b32 v24, s30, 34
 ; SI-NEXT:    s_cmp_lg_u32 s4, 0
+; SI-NEXT:    v_writelane_b32 v24, s31, 35
 ; SI-NEXT:    s_cbranch_scc0 .LBB57_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_and_b32 s5, s17, 0xffff
@@ -37187,6 +37187,7 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i
 ; SI-NEXT:    s_and_b32 s27, s47, 0xffff
 ; SI-NEXT:    s_lshl_b32 s28, s55, 16
 ; SI-NEXT:    s_or_b32 s27, s27, s28
+; SI-NEXT:    v_readlane_b32 s30, v24, 34
 ; SI-NEXT:    v_mov_b32_e32 v0, s16
 ; SI-NEXT:    v_mov_b32_e32 v1, s17
 ; SI-NEXT:    v_mov_b32_e32 v2, s18
@@ -37211,42 +37212,41 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v21, s25
 ; SI-NEXT:    v_mov_b32_e32 v22, s26
 ; SI-NEXT:    v_mov_b32_e32 v23, s27
-; SI-NEXT:    v_readlane_b32 s99, v24, 35
-; SI-NEXT:    v_readlane_b32 s98, v24, 34
-; SI-NEXT:    v_readlane_b32 s97, v24, 33
-; SI-NEXT:    v_readlane_b32 s96, v24, 32
-; SI-NEXT:    v_readlane_b32 s87, v24, 31
-; SI-NEXT:    v_readlane_b32 s86, v24, 30
-; SI-NEXT:    v_readlane_b32 s85, v24, 29
-; SI-NEXT:    v_readlane_b32 s84, v24, 28
-; SI-NEXT:    v_readlane_b32 s83, v24, 27
-; SI-NEXT:    v_readlane_b32 s82, v24, 26
-; SI-NEXT:    v_readlane_b32 s81, v24, 25
-; SI-NEXT:    v_readlane_b32 s80, v24, 24
-; SI-NEXT:    v_readlane_b32 s71, v24, 23
-; SI-NEXT:    v_readlane_b32 s70, v24, 22
-; SI-NEXT:    v_readlane_b32 s69, v24, 21
-; SI-NEXT:    v_readlane_b32 s68, v24, 20
-; SI-NEXT:    v_readlane_b32 s67, v24, 19
-; SI-NEXT:    v_readlane_b32 s66, v24, 18
-; SI-NEXT:    v_readlane_b32 s65, v24, 17
-; SI-NEXT:    v_readlane_b32 s64, v24, 16
-; SI-NEXT:    v_readlane_b32 s55, v24, 15
-; SI-NEXT:    v_readlane_b32 s54, v24, 14
-; SI-NEXT:    v_readlane_b32 s53, v24, 13
-; SI-NEXT:    v_readlane_b32 s52, v24, 12
-; SI-NEXT:    v_readlane_b32 s51, v24, 11
-; SI-NEXT:    v_readlane_b32 s50, v24, 10
-; SI-NEXT:    v_readlane_b32 s49, v24, 9
-; SI-NEXT:    v_readlane_b32 s48, v24, 8
-; SI-NEXT:    v_readlane_b32 s39, v24, 7
-; SI-NEXT:    v_readlane_b32 s38, v24, 6
-; SI-NEXT:    v_readlane_b32 s37, v24, 5
-; SI-NEXT:    v_readlane_b32 s36, v24, 4
-; SI-NEXT:    v_readlane_b32 s35, v24, 3
-; SI-NEXT:    v_readlane_b32 s34, v24, 2
-; SI-NEXT:    v_readlane_b32 s31, v24, 1
-; SI-NEXT:    v_readlane_b32 s30, v24, 0
+; SI-NEXT:    v_readlane_b32 s31, v24, 35
+; SI-NEXT:    v_readlane_b32 s99, v24, 33
+; SI-NEXT:    v_readlane_b32 s98, v24, 32
+; SI-NEXT:    v_readlane_b32 s97, v24, 31
+; SI-NEXT:    v_readlane_b32 s96, v24, 30
+; SI-NEXT:    v_readlane_b32 s87, v24, 29
+; SI-NEXT:    v_readlane_b32 s86, v24, 28
+; SI-NEXT:    v_readlane_b32 s85, v24, 27
+; SI-NEXT:    v_readlane_b32 s84, v24, 26
+; SI-NEXT:    v_readlane_b32 s83, v24, 25
+; SI-NEXT:    v_readlane_b32 s82, v24, 24
+; SI-NEXT:    v_readlane_b32 s81, v24, 23
+; SI-NEXT:    v_readlane_b32 s80, v24, 22
+; SI-NEXT:    v_readlane_b32 s71, v24, 21
+; SI-NEXT:    v_readlane_b32 s70, v24, 20
+; SI-NEXT:    v_readlane_b32 s69, v24, 19
+; SI-NEXT:    v_readlane_b32 s68, v24, 18
+; SI-NEXT:    v_readlane_b32 s67, v24, 17
+; SI-NEXT:    v_readlane_b32 s66, v24, 16
+; SI-NEXT:    v_readlane_b32 s65, v24, 15
+; SI-NEXT:    v_readlane_b32 s64, v24, 14
+; SI-NEXT:    v_readlane_b32 s55, v24, 13
+; SI-NEXT:    v_readlane_b32 s54, v24, 12
+; SI-NEXT:    v_readlane_b32 s53, v24, 11
+; SI-NEXT:    v_readlane_b32 s52, v24, 10
+; SI-NEXT:    v_readlane_b32 s51, v24, 9
+; SI-NEXT:    v_readlane_b32 s50, v24, 8
+; SI-NEXT:    v_readlane_b32 s49, v24, 7
+; SI-NEXT:    v_readlane_b32 s48, v24, 6
+; SI-NEXT:    v_readlane_b32 s39, v24, 5
+; SI-NEXT:    v_readlane_b32 s38, v24, 4
+; SI-NEXT:    v_readlane_b32 s37, v24, 3
+; SI-NEXT:    v_readlane_b32 s36, v24, 2
+; SI-NEXT:    v_readlane_b32 s35, v24, 1
+; SI-NEXT:    v_readlane_b32 s34, v24, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -38067,7 +38067,7 @@ end:
   ret <48 x half> %phi
 }
 
-define <48 x i16> @bitcast_v48f16_to_v48i16(<48 x half> %a, i32 %b) {
+define <48 x i16> @bitcast_v48f16_to_v48i16(<48 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v48f16_to_v48i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -38843,7 +38843,7 @@ end:
   ret <48 x i16> %phi
 }
 
-define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i32 inreg %b) {
+define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v48f16_to_v48i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -40069,3 +40069,5 @@ end:
   %phi = phi <48 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
   ret <48 x i16> %phi
 }
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll
index b52e8a54540dc..272038cfc4881 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll
@@ -6,7 +6,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
-define <26 x float> @bitcast_v26i32_to_v26f32(<26 x i32> %a, i32 %b) {
+define <26 x float> @bitcast_v26i32_to_v26f32(<26 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v26i32_to_v26f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -180,7 +180,7 @@ end:
   ret <26 x float> %phi
 }
 
-define inreg <26 x float> @bitcast_v26i32_to_v26f32_scalar(<26 x i32> inreg %a, i32 inreg %b) {
+define inreg <26 x float> @bitcast_v26i32_to_v26f32_scalar(<26 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v26i32_to_v26f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -495,7 +495,7 @@ end:
   ret <26 x float> %phi
 }
 
-define <26 x i32> @bitcast_v26f32_to_v26i32(<26 x float> %a, i32 %b) {
+define <26 x i32> @bitcast_v26f32_to_v26i32(<26 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v26f32_to_v26i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -656,7 +656,7 @@ end:
   ret <26 x i32> %phi
 }
 
-define inreg <26 x i32> @bitcast_v26f32_to_v26i32_scalar(<26 x float> inreg %a, i32 inreg %b) {
+define inreg <26 x i32> @bitcast_v26f32_to_v26i32_scalar(<26 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v26f32_to_v26i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1184,7 +1184,7 @@ end:
   ret <26 x i32> %phi
 }
 
-define <13 x i64> @bitcast_v26i32_to_v13i64(<26 x i32> %a, i32 %b) {
+define <13 x i64> @bitcast_v26i32_to_v13i64(<26 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v26i32_to_v13i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1358,7 +1358,7 @@ end:
   ret <13 x i64> %phi
 }
 
-define inreg <13 x i64> @bitcast_v26i32_to_v13i64_scalar(<26 x i32> inreg %a, i32 inreg %b) {
+define inreg <13 x i64> @bitcast_v26i32_to_v13i64_scalar(<26 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v26i32_to_v13i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1673,7 +1673,7 @@ end:
   ret <13 x i64> %phi
 }
 
-define <26 x i32> @bitcast_v13i64_to_v26i32(<13 x i64> %a, i32 %b) {
+define <26 x i32> @bitcast_v13i64_to_v26i32(<13 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v13i64_to_v26i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1854,7 +1854,7 @@ end:
   ret <26 x i32> %phi
 }
 
-define inreg <26 x i32> @bitcast_v13i64_to_v26i32_scalar(<13 x i64> inreg %a, i32 inreg %b) {
+define inreg <26 x i32> @bitcast_v13i64_to_v26i32_scalar(<13 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v13i64_to_v26i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2169,7 +2169,7 @@ end:
   ret <26 x i32> %phi
 }
 
-define <13 x double> @bitcast_v26i32_to_v13f64(<26 x i32> %a, i32 %b) {
+define <13 x double> @bitcast_v26i32_to_v13f64(<26 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v26i32_to_v13f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2343,7 +2343,7 @@ end:
   ret <13 x double> %phi
 }
 
-define inreg <13 x double> @bitcast_v26i32_to_v13f64_scalar(<26 x i32> inreg %a, i32 inreg %b) {
+define inreg <13 x double> @bitcast_v26i32_to_v13f64_scalar(<26 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v26i32_to_v13f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2658,7 +2658,7 @@ end:
   ret <13 x double> %phi
 }
 
-define <26 x i32> @bitcast_v13f64_to_v26i32(<13 x double> %a, i32 %b) {
+define <26 x i32> @bitcast_v13f64_to_v26i32(<13 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v13f64_to_v26i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2780,7 +2780,7 @@ end:
   ret <26 x i32> %phi
 }
 
-define inreg <26 x i32> @bitcast_v13f64_to_v26i32_scalar(<13 x double> inreg %a, i32 inreg %b) {
+define inreg <26 x i32> @bitcast_v13f64_to_v26i32_scalar(<13 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v13f64_to_v26i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3256,7 +3256,7 @@ end:
   ret <26 x i32> %phi
 }
 
-define <52 x i16> @bitcast_v26i32_to_v52i16(<26 x i32> %a, i32 %b) {
+define <52 x i16> @bitcast_v26i32_to_v52i16(<26 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v26i32_to_v52i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4014,7 +4014,7 @@ end:
   ret <52 x i16> %phi
 }
 
-define inreg <52 x i16> @bitcast_v26i32_to_v52i16_scalar(<26 x i32> inreg %a, i32 inreg %b) {
+define inreg <52 x i16> @bitcast_v26i32_to_v52i16_scalar(<26 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v26i32_to_v52i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4022,15 +4022,15 @@ define inreg <52 x i16> @bitcast_v26i32_to_v52i16_scalar(<26 x i32> inreg %a, i3
 ; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v26, s30, 0
-; SI-NEXT:    v_writelane_b32 v26, s31, 1
-; SI-NEXT:    v_writelane_b32 v26, s34, 2
-; SI-NEXT:    v_writelane_b32 v26, s35, 3
-; SI-NEXT:    v_writelane_b32 v26, s36, 4
-; SI-NEXT:    v_writelane_b32 v26, s37, 5
-; SI-NEXT:    v_writelane_b32 v26, s38, 6
+; SI-NEXT:    v_writelane_b32 v26, s34, 0
+; SI-NEXT:    v_writelane_b32 v26, s35, 1
+; SI-NEXT:    v_writelane_b32 v26, s36, 2
+; SI-NEXT:    v_writelane_b32 v26, s37, 3
+; SI-NEXT:    v_writelane_b32 v26, s38, 4
+; SI-NEXT:    v_writelane_b32 v26, s39, 5
+; SI-NEXT:    v_writelane_b32 v26, s48, 6
 ; SI-NEXT:    v_readfirstlane_b32 s14, v12
-; SI-NEXT:    v_writelane_b32 v26, s39, 7
+; SI-NEXT:    v_writelane_b32 v26, s30, 7
 ; SI-NEXT:    v_readfirstlane_b32 s5, v11
 ; SI-NEXT:    v_readfirstlane_b32 s4, v10
 ; SI-NEXT:    v_readfirstlane_b32 s7, v9
@@ -4044,7 +4044,7 @@ define inreg <52 x i16> @bitcast_v26i32_to_v52i16_scalar(<26 x i32> inreg %a, i3
 ; SI-NEXT:    v_readfirstlane_b32 s15, v1
 ; SI-NEXT:    s_cmp_lg_u32 s14, 0
 ; SI-NEXT:    v_readfirstlane_b32 s14, v0
-; SI-NEXT:    v_writelane_b32 v26, s48, 8
+; SI-NEXT:    v_writelane_b32 v26, s31, 8
 ; SI-NEXT:    s_cbranch_scc0 .LBB13_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s92, s5, 16
@@ -4206,6 +4206,7 @@ define inreg <52 x i16> @bitcast_v26i32_to_v52i16_scalar(<26 x i32> inreg %a, i3
 ; SI-NEXT:    s_lshl_b32 s40, s92, 16
 ; SI-NEXT:    s_or_b32 s7, s7, s41
 ; SI-NEXT:    s_or_b32 s5, s5, s40
+; SI-NEXT:    v_readlane_b32 s30, v26, 7
 ; SI-NEXT:    v_mov_b32_e32 v0, s16
 ; SI-NEXT:    v_mov_b32_e32 v1, s17
 ; SI-NEXT:    v_mov_b32_e32 v2, s18
@@ -4232,15 +4233,14 @@ define inreg <52 x i16> @bitcast_v26i32_to_v52i16_scalar(<26 x i32> inreg %a, i3
 ; SI-NEXT:    v_mov_b32_e32 v23, s7
 ; SI-NEXT:    v_mov_b32_e32 v24, s4
 ; SI-NEXT:    v_mov_b32_e32 v25, s5
-; SI-NEXT:    v_readlane_b32 s48, v26, 8
-; SI-NEXT:    v_readlane_b32 s39, v26, 7
-; SI-NEXT:    v_readlane_b32 s38, v26, 6
-; SI-NEXT:    v_readlane_b32 s37, v26, 5
-; SI-NEXT:    v_readlane_b32 s36, v26, 4
-; SI-NEXT:    v_readlane_b32 s35, v26, 3
-; SI-NEXT:    v_readlane_b32 s34, v26, 2
-; SI-NEXT:    v_readlane_b32 s31, v26, 1
-; SI-NEXT:    v_readlane_b32 s30, v26, 0
+; SI-NEXT:    v_readlane_b32 s31, v26, 8
+; SI-NEXT:    v_readlane_b32 s48, v26, 6
+; SI-NEXT:    v_readlane_b32 s39, v26, 5
+; SI-NEXT:    v_readlane_b32 s38, v26, 4
+; SI-NEXT:    v_readlane_b32 s37, v26, 3
+; SI-NEXT:    v_readlane_b32 s36, v26, 2
+; SI-NEXT:    v_readlane_b32 s35, v26, 1
+; SI-NEXT:    v_readlane_b32 s34, v26, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -4875,7 +4875,7 @@ end:
   ret <52 x i16> %phi
 }
 
-define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) {
+define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v52i16_to_v26i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6024,7 +6024,7 @@ end:
   ret <26 x i32> %phi
 }
 
-define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i32 inreg %b) {
+define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v52i16_to_v26i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6032,33 +6032,34 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3
 ; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v26, s30, 0
-; SI-NEXT:    v_writelane_b32 v26, s31, 1
-; SI-NEXT:    v_writelane_b32 v26, s34, 2
-; SI-NEXT:    v_writelane_b32 v26, s35, 3
-; SI-NEXT:    v_writelane_b32 v26, s36, 4
-; SI-NEXT:    v_writelane_b32 v26, s37, 5
-; SI-NEXT:    v_writelane_b32 v26, s38, 6
-; SI-NEXT:    v_writelane_b32 v26, s39, 7
-; SI-NEXT:    v_writelane_b32 v26, s48, 8
-; SI-NEXT:    v_writelane_b32 v26, s49, 9
-; SI-NEXT:    v_writelane_b32 v26, s50, 10
-; SI-NEXT:    v_writelane_b32 v26, s51, 11
-; SI-NEXT:    v_writelane_b32 v26, s52, 12
-; SI-NEXT:    v_writelane_b32 v26, s53, 13
-; SI-NEXT:    v_writelane_b32 v26, s54, 14
-; SI-NEXT:    v_writelane_b32 v26, s55, 15
-; SI-NEXT:    v_writelane_b32 v26, s64, 16
-; SI-NEXT:    v_writelane_b32 v26, s65, 17
-; SI-NEXT:    v_writelane_b32 v26, s66, 18
-; SI-NEXT:    v_writelane_b32 v26, s67, 19
-; SI-NEXT:    v_writelane_b32 v26, s68, 20
-; SI-NEXT:    v_writelane_b32 v26, s69, 21
-; SI-NEXT:    v_writelane_b32 v26, s70, 22
-; SI-NEXT:    v_writelane_b32 v26, s71, 23
-; SI-NEXT:    v_writelane_b32 v26, s80, 24
-; SI-NEXT:    v_writelane_b32 v26, s81, 25
-; SI-NEXT:    v_writelane_b32 v26, s82, 26
+; SI-NEXT:    v_writelane_b32 v26, s34, 0
+; SI-NEXT:    v_writelane_b32 v26, s35, 1
+; SI-NEXT:    v_writelane_b32 v26, s36, 2
+; SI-NEXT:    v_writelane_b32 v26, s37, 3
+; SI-NEXT:    v_writelane_b32 v26, s38, 4
+; SI-NEXT:    v_writelane_b32 v26, s39, 5
+; SI-NEXT:    v_writelane_b32 v26, s48, 6
+; SI-NEXT:    v_writelane_b32 v26, s49, 7
+; SI-NEXT:    v_writelane_b32 v26, s50, 8
+; SI-NEXT:    v_writelane_b32 v26, s51, 9
+; SI-NEXT:    v_writelane_b32 v26, s52, 10
+; SI-NEXT:    v_writelane_b32 v26, s53, 11
+; SI-NEXT:    v_writelane_b32 v26, s54, 12
+; SI-NEXT:    v_writelane_b32 v26, s55, 13
+; SI-NEXT:    v_writelane_b32 v26, s64, 14
+; SI-NEXT:    v_writelane_b32 v26, s65, 15
+; SI-NEXT:    v_writelane_b32 v26, s66, 16
+; SI-NEXT:    v_writelane_b32 v26, s67, 17
+; SI-NEXT:    v_writelane_b32 v26, s68, 18
+; SI-NEXT:    v_writelane_b32 v26, s69, 19
+; SI-NEXT:    v_writelane_b32 v26, s70, 20
+; SI-NEXT:    v_writelane_b32 v26, s71, 21
+; SI-NEXT:    v_writelane_b32 v26, s80, 22
+; SI-NEXT:    v_writelane_b32 v26, s81, 23
+; SI-NEXT:    v_writelane_b32 v26, s82, 24
+; SI-NEXT:    v_writelane_b32 v26, s83, 25
+; SI-NEXT:    v_writelane_b32 v26, s30, 26
+; SI-NEXT:    v_writelane_b32 v26, s31, 27
 ; SI-NEXT:    v_readfirstlane_b32 s7, v11
 ; SI-NEXT:    v_readfirstlane_b32 s9, v10
 ; SI-NEXT:    v_readfirstlane_b32 s11, v9
@@ -6071,7 +6072,6 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3
 ; SI-NEXT:    v_readfirstlane_b32 s93, v2
 ; SI-NEXT:    v_readfirstlane_b32 s30, v1
 ; SI-NEXT:    v_readfirstlane_b32 s35, v0
-; SI-NEXT:    v_writelane_b32 v26, s83, 27
 ; SI-NEXT:    s_lshr_b32 s76, s29, 16
 ; SI-NEXT:    s_lshr_b32 s79, s28, 16
 ; SI-NEXT:    s_lshr_b32 s89, s27, 16
@@ -6313,6 +6313,7 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_add_i32 s61, s4, 0x30000
 ; SI-NEXT:  .LBB15_3: ; %end
+; SI-NEXT:    v_readlane_b32 s30, v26, 26
 ; SI-NEXT:    v_mov_b32_e32 v0, s36
 ; SI-NEXT:    v_mov_b32_e32 v1, s37
 ; SI-NEXT:    v_mov_b32_e32 v2, s38
@@ -6339,34 +6340,33 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3
 ; SI-NEXT:    v_mov_b32_e32 v23, s59
 ; SI-NEXT:    v_mov_b32_e32 v24, s60
 ; SI-NEXT:    v_mov_b32_e32 v25, s61
-; SI-NEXT:    v_readlane_b32 s83, v26, 27
-; SI-NEXT:    v_readlane_b32 s82, v26, 26
-; SI-NEXT:    v_readlane_b32 s81, v26, 25
-; SI-NEXT:    v_readlane_b32 s80, v26, 24
-; SI-NEXT:    v_readlane_b32 s71, v26, 23
-; SI-NEXT:    v_readlane_b32 s70, v26, 22
-; SI-NEXT:    v_readlane_b32 s69, v26, 21
-; SI-NEXT:    v_readlane_b32 s68, v26, 20
-; SI-NEXT:    v_readlane_b32 s67, v26, 19
-; SI-NEXT:    v_readlane_b32 s66, v26, 18
-; SI-NEXT:    v_readlane_b32 s65, v26, 17
-; SI-NEXT:    v_readlane_b32 s64, v26, 16
-; SI-NEXT:    v_readlane_b32 s55, v26, 15
-; SI-NEXT:    v_readlane_b32 s54, v26, 14
-; SI-NEXT:    v_readlane_b32 s53, v26, 13
-; SI-NEXT:    v_readlane_b32 s52, v26, 12
-; SI-NEXT:    v_readlane_b32 s51, v26, 11
-; SI-NEXT:    v_readlane_b32 s50, v26, 10
-; SI-NEXT:    v_readlane_b32 s49, v26, 9
-; SI-NEXT:    v_readlane_b32 s48, v26, 8
-; SI-NEXT:    v_readlane_b32 s39, v26, 7
-; SI-NEXT:    v_readlane_b32 s38, v26, 6
-; SI-NEXT:    v_readlane_b32 s37, v26, 5
-; SI-NEXT:    v_readlane_b32 s36, v26, 4
-; SI-NEXT:    v_readlane_b32 s35, v26, 3
-; SI-NEXT:    v_readlane_b32 s34, v26, 2
-; SI-NEXT:    v_readlane_b32 s31, v26, 1
-; SI-NEXT:    v_readlane_b32 s30, v26, 0
+; SI-NEXT:    v_readlane_b32 s31, v26, 27
+; SI-NEXT:    v_readlane_b32 s83, v26, 25
+; SI-NEXT:    v_readlane_b32 s82, v26, 24
+; SI-NEXT:    v_readlane_b32 s81, v26, 23
+; SI-NEXT:    v_readlane_b32 s80, v26, 22
+; SI-NEXT:    v_readlane_b32 s71, v26, 21
+; SI-NEXT:    v_readlane_b32 s70, v26, 20
+; SI-NEXT:    v_readlane_b32 s69, v26, 19
+; SI-NEXT:    v_readlane_b32 s68, v26, 18
+; SI-NEXT:    v_readlane_b32 s67, v26, 17
+; SI-NEXT:    v_readlane_b32 s66, v26, 16
+; SI-NEXT:    v_readlane_b32 s65, v26, 15
+; SI-NEXT:    v_readlane_b32 s64, v26, 14
+; SI-NEXT:    v_readlane_b32 s55, v26, 13
+; SI-NEXT:    v_readlane_b32 s54, v26, 12
+; SI-NEXT:    v_readlane_b32 s53, v26, 11
+; SI-NEXT:    v_readlane_b32 s52, v26, 10
+; SI-NEXT:    v_readlane_b32 s51, v26, 9
+; SI-NEXT:    v_readlane_b32 s50, v26, 8
+; SI-NEXT:    v_readlane_b32 s49, v26, 7
+; SI-NEXT:    v_readlane_b32 s48, v26, 6
+; SI-NEXT:    v_readlane_b32 s39, v26, 5
+; SI-NEXT:    v_readlane_b32 s38, v26, 4
+; SI-NEXT:    v_readlane_b32 s37, v26, 3
+; SI-NEXT:    v_readlane_b32 s36, v26, 2
+; SI-NEXT:    v_readlane_b32 s35, v26, 1
+; SI-NEXT:    v_readlane_b32 s34, v26, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -6382,37 +6382,38 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v26, s30, 0
-; VI-NEXT:    v_writelane_b32 v26, s31, 1
-; VI-NEXT:    v_writelane_b32 v26, s34, 2
-; VI-NEXT:    v_writelane_b32 v26, s35, 3
-; VI-NEXT:    v_writelane_b32 v26, s36, 4
-; VI-NEXT:    v_writelane_b32 v26, s37, 5
-; VI-NEXT:    v_writelane_b32 v26, s38, 6
-; VI-NEXT:    v_writelane_b32 v26, s39, 7
-; VI-NEXT:    v_writelane_b32 v26, s48, 8
-; VI-NEXT:    v_writelane_b32 v26, s49, 9
-; VI-NEXT:    v_writelane_b32 v26, s50, 10
-; VI-NEXT:    v_writelane_b32 v26, s51, 11
-; VI-NEXT:    v_writelane_b32 v26, s52, 12
-; VI-NEXT:    v_writelane_b32 v26, s53, 13
-; VI-NEXT:    v_writelane_b32 v26, s54, 14
-; VI-NEXT:    v_writelane_b32 v26, s55, 15
-; VI-NEXT:    v_writelane_b32 v26, s64, 16
-; VI-NEXT:    v_writelane_b32 v26, s65, 17
-; VI-NEXT:    v_writelane_b32 v26, s66, 18
-; VI-NEXT:    v_writelane_b32 v26, s67, 19
-; VI-NEXT:    v_writelane_b32 v26, s68, 20
-; VI-NEXT:    v_writelane_b32 v26, s69, 21
-; VI-NEXT:    v_writelane_b32 v26, s70, 22
-; VI-NEXT:    v_writelane_b32 v26, s71, 23
-; VI-NEXT:    v_writelane_b32 v26, s80, 24
-; VI-NEXT:    v_writelane_b32 v26, s81, 25
-; VI-NEXT:    v_writelane_b32 v26, s82, 26
-; VI-NEXT:    v_writelane_b32 v26, s83, 27
-; VI-NEXT:    v_writelane_b32 v26, s84, 28
-; VI-NEXT:    v_writelane_b32 v26, s85, 29
-; VI-NEXT:    v_writelane_b32 v26, s86, 30
+; VI-NEXT:    v_writelane_b32 v26, s34, 0
+; VI-NEXT:    v_writelane_b32 v26, s35, 1
+; VI-NEXT:    v_writelane_b32 v26, s36, 2
+; VI-NEXT:    v_writelane_b32 v26, s37, 3
+; VI-NEXT:    v_writelane_b32 v26, s38, 4
+; VI-NEXT:    v_writelane_b32 v26, s39, 5
+; VI-NEXT:    v_writelane_b32 v26, s48, 6
+; VI-NEXT:    v_writelane_b32 v26, s49, 7
+; VI-NEXT:    v_writelane_b32 v26, s50, 8
+; VI-NEXT:    v_writelane_b32 v26, s51, 9
+; VI-NEXT:    v_writelane_b32 v26, s52, 10
+; VI-NEXT:    v_writelane_b32 v26, s53, 11
+; VI-NEXT:    v_writelane_b32 v26, s54, 12
+; VI-NEXT:    v_writelane_b32 v26, s55, 13
+; VI-NEXT:    v_writelane_b32 v26, s64, 14
+; VI-NEXT:    v_writelane_b32 v26, s65, 15
+; VI-NEXT:    v_writelane_b32 v26, s66, 16
+; VI-NEXT:    v_writelane_b32 v26, s67, 17
+; VI-NEXT:    v_writelane_b32 v26, s68, 18
+; VI-NEXT:    v_writelane_b32 v26, s69, 19
+; VI-NEXT:    v_writelane_b32 v26, s70, 20
+; VI-NEXT:    v_writelane_b32 v26, s71, 21
+; VI-NEXT:    v_writelane_b32 v26, s80, 22
+; VI-NEXT:    v_writelane_b32 v26, s81, 23
+; VI-NEXT:    v_writelane_b32 v26, s82, 24
+; VI-NEXT:    v_writelane_b32 v26, s83, 25
+; VI-NEXT:    v_writelane_b32 v26, s84, 26
+; VI-NEXT:    v_writelane_b32 v26, s85, 27
+; VI-NEXT:    v_writelane_b32 v26, s86, 28
+; VI-NEXT:    v_writelane_b32 v26, s87, 29
+; VI-NEXT:    v_writelane_b32 v26, s30, 30
+; VI-NEXT:    v_writelane_b32 v26, s31, 31
 ; VI-NEXT:    v_readfirstlane_b32 s7, v11
 ; VI-NEXT:    v_readfirstlane_b32 s9, v10
 ; VI-NEXT:    v_readfirstlane_b32 s11, v9
@@ -6425,7 +6426,6 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3
 ; VI-NEXT:    v_readfirstlane_b32 s31, v2
 ; VI-NEXT:    v_readfirstlane_b32 s68, v1
 ; VI-NEXT:    v_readfirstlane_b32 s71, v0
-; VI-NEXT:    v_writelane_b32 v26, s87, 31
 ; VI-NEXT:    s_lshr_b32 s76, s29, 16
 ; VI-NEXT:    s_lshr_b32 s79, s28, 16
 ; VI-NEXT:    s_lshr_b32 s89, s27, 16
@@ -6667,6 +6667,7 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3
 ; VI-NEXT:    s_or_b32 s4, s5, s4
 ; VI-NEXT:    s_add_i32 s61, s4, 0x30000
 ; VI-NEXT:  .LBB15_3: ; %end
+; VI-NEXT:    v_readlane_b32 s30, v26, 30
 ; VI-NEXT:    v_mov_b32_e32 v0, s36
 ; VI-NEXT:    v_mov_b32_e32 v1, s37
 ; VI-NEXT:    v_mov_b32_e32 v2, s38
@@ -6693,38 +6694,37 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3
 ; VI-NEXT:    v_mov_b32_e32 v23, s59
 ; VI-NEXT:    v_mov_b32_e32 v24, s60
 ; VI-NEXT:    v_mov_b32_e32 v25, s61
-; VI-NEXT:    v_readlane_b32 s87, v26, 31
-; VI-NEXT:    v_readlane_b32 s86, v26, 30
-; VI-NEXT:    v_readlane_b32 s85, v26, 29
-; VI-NEXT:    v_readlane_b32 s84, v26, 28
-; VI-NEXT:    v_readlane_b32 s83, v26, 27
-; VI-NEXT:    v_readlane_b32 s82, v26, 26
-; VI-NEXT:    v_readlane_b32 s81, v26, 25
-; VI-NEXT:    v_readlane_b32 s80, v26, 24
-; VI-NEXT:    v_readlane_b32 s71, v26, 23
-; VI-NEXT:    v_readlane_b32 s70, v26, 22
-; VI-NEXT:    v_readlane_b32 s69, v26, 21
-; VI-NEXT:    v_readlane_b32 s68, v26, 20
-; VI-NEXT:    v_readlane_b32 s67, v26, 19
-; VI-NEXT:    v_readlane_b32 s66, v26, 18
-; VI-NEXT:    v_readlane_b32 s65, v26, 17
-; VI-NEXT:    v_readlane_b32 s64, v26, 16
-; VI-NEXT:    v_readlane_b32 s55, v26, 15
-; VI-NEXT:    v_readlane_b32 s54, v26, 14
-; VI-NEXT:    v_readlane_b32 s53, v26, 13
-; VI-NEXT:    v_readlane_b32 s52, v26, 12
-; VI-NEXT:    v_readlane_b32 s51, v26, 11
-; VI-NEXT:    v_readlane_b32 s50, v26, 10
-; VI-NEXT:    v_readlane_b32 s49, v26, 9
-; VI-NEXT:    v_readlane_b32 s48, v26, 8
-; VI-NEXT:    v_readlane_b32 s39, v26, 7
-; VI-NEXT:    v_readlane_b32 s38, v26, 6
-; VI-NEXT:    v_readlane_b32 s37, v26, 5
-; VI-NEXT:    v_readlane_b32 s36, v26, 4
-; VI-NEXT:    v_readlane_b32 s35, v26, 3
-; VI-NEXT:    v_readlane_b32 s34, v26, 2
-; VI-NEXT:    v_readlane_b32 s31, v26, 1
-; VI-NEXT:    v_readlane_b32 s30, v26, 0
+; VI-NEXT:    v_readlane_b32 s31, v26, 31
+; VI-NEXT:    v_readlane_b32 s87, v26, 29
+; VI-NEXT:    v_readlane_b32 s86, v26, 28
+; VI-NEXT:    v_readlane_b32 s85, v26, 27
+; VI-NEXT:    v_readlane_b32 s84, v26, 26
+; VI-NEXT:    v_readlane_b32 s83, v26, 25
+; VI-NEXT:    v_readlane_b32 s82, v26, 24
+; VI-NEXT:    v_readlane_b32 s81, v26, 23
+; VI-NEXT:    v_readlane_b32 s80, v26, 22
+; VI-NEXT:    v_readlane_b32 s71, v26, 21
+; VI-NEXT:    v_readlane_b32 s70, v26, 20
+; VI-NEXT:    v_readlane_b32 s69, v26, 19
+; VI-NEXT:    v_readlane_b32 s68, v26, 18
+; VI-NEXT:    v_readlane_b32 s67, v26, 17
+; VI-NEXT:    v_readlane_b32 s66, v26, 16
+; VI-NEXT:    v_readlane_b32 s65, v26, 15
+; VI-NEXT:    v_readlane_b32 s64, v26, 14
+; VI-NEXT:    v_readlane_b32 s55, v26, 13
+; VI-NEXT:    v_readlane_b32 s54, v26, 12
+; VI-NEXT:    v_readlane_b32 s53, v26, 11
+; VI-NEXT:    v_readlane_b32 s52, v26, 10
+; VI-NEXT:    v_readlane_b32 s51, v26, 9
+; VI-NEXT:    v_readlane_b32 s50, v26, 8
+; VI-NEXT:    v_readlane_b32 s49, v26, 7
+; VI-NEXT:    v_readlane_b32 s48, v26, 6
+; VI-NEXT:    v_readlane_b32 s39, v26, 5
+; VI-NEXT:    v_readlane_b32 s38, v26, 4
+; VI-NEXT:    v_readlane_b32 s37, v26, 3
+; VI-NEXT:    v_readlane_b32 s36, v26, 2
+; VI-NEXT:    v_readlane_b32 s35, v26, 1
+; VI-NEXT:    v_readlane_b32 s34, v26, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -7038,7 +7038,7 @@ end:
   ret <26 x i32> %phi
 }
 
-define <52 x half> @bitcast_v26i32_to_v52f16(<26 x i32> %a, i32 %b) {
+define <52 x half> @bitcast_v26i32_to_v52f16(<26 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v26i32_to_v52f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7796,7 +7796,7 @@ end:
   ret <52 x half> %phi
 }
 
-define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 x i32> inreg %a, i32 inreg %b) {
+define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v26i32_to_v52f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7804,15 +7804,15 @@ define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 x i32> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v26, s30, 0
-; SI-NEXT:    v_writelane_b32 v26, s31, 1
-; SI-NEXT:    v_writelane_b32 v26, s34, 2
-; SI-NEXT:    v_writelane_b32 v26, s35, 3
-; SI-NEXT:    v_writelane_b32 v26, s36, 4
-; SI-NEXT:    v_writelane_b32 v26, s37, 5
-; SI-NEXT:    v_writelane_b32 v26, s38, 6
+; SI-NEXT:    v_writelane_b32 v26, s34, 0
+; SI-NEXT:    v_writelane_b32 v26, s35, 1
+; SI-NEXT:    v_writelane_b32 v26, s36, 2
+; SI-NEXT:    v_writelane_b32 v26, s37, 3
+; SI-NEXT:    v_writelane_b32 v26, s38, 4
+; SI-NEXT:    v_writelane_b32 v26, s39, 5
+; SI-NEXT:    v_writelane_b32 v26, s48, 6
 ; SI-NEXT:    v_readfirstlane_b32 s14, v12
-; SI-NEXT:    v_writelane_b32 v26, s39, 7
+; SI-NEXT:    v_writelane_b32 v26, s30, 7
 ; SI-NEXT:    v_readfirstlane_b32 s5, v11
 ; SI-NEXT:    v_readfirstlane_b32 s4, v10
 ; SI-NEXT:    v_readfirstlane_b32 s7, v9
@@ -7826,7 +7826,7 @@ define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 x i32> inreg %a, i
 ; SI-NEXT:    v_readfirstlane_b32 s15, v1
 ; SI-NEXT:    s_cmp_lg_u32 s14, 0
 ; SI-NEXT:    v_readfirstlane_b32 s14, v0
-; SI-NEXT:    v_writelane_b32 v26, s48, 8
+; SI-NEXT:    v_writelane_b32 v26, s31, 8
 ; SI-NEXT:    s_cbranch_scc0 .LBB17_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s92, s5, 16
@@ -7988,6 +7988,7 @@ define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 x i32> inreg %a, i
 ; SI-NEXT:    s_lshl_b32 s40, s92, 16
 ; SI-NEXT:    s_or_b32 s7, s7, s41
 ; SI-NEXT:    s_or_b32 s5, s5, s40
+; SI-NEXT:    v_readlane_b32 s30, v26, 7
 ; SI-NEXT:    v_mov_b32_e32 v0, s16
 ; SI-NEXT:    v_mov_b32_e32 v1, s17
 ; SI-NEXT:    v_mov_b32_e32 v2, s18
@@ -8014,15 +8015,14 @@ define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 x i32> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v23, s7
 ; SI-NEXT:    v_mov_b32_e32 v24, s4
 ; SI-NEXT:    v_mov_b32_e32 v25, s5
-; SI-NEXT:    v_readlane_b32 s48, v26, 8
-; SI-NEXT:    v_readlane_b32 s39, v26, 7
-; SI-NEXT:    v_readlane_b32 s38, v26, 6
-; SI-NEXT:    v_readlane_b32 s37, v26, 5
-; SI-NEXT:    v_readlane_b32 s36, v26, 4
-; SI-NEXT:    v_readlane_b32 s35, v26, 3
-; SI-NEXT:    v_readlane_b32 s34, v26, 2
-; SI-NEXT:    v_readlane_b32 s31, v26, 1
-; SI-NEXT:    v_readlane_b32 s30, v26, 0
+; SI-NEXT:    v_readlane_b32 s31, v26, 8
+; SI-NEXT:    v_readlane_b32 s48, v26, 6
+; SI-NEXT:    v_readlane_b32 s39, v26, 5
+; SI-NEXT:    v_readlane_b32 s38, v26, 4
+; SI-NEXT:    v_readlane_b32 s37, v26, 3
+; SI-NEXT:    v_readlane_b32 s36, v26, 2
+; SI-NEXT:    v_readlane_b32 s35, v26, 1
+; SI-NEXT:    v_readlane_b32 s34, v26, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -8657,7 +8657,7 @@ end:
   ret <52 x half> %phi
 }
 
-define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) {
+define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v52f16_to_v26i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9927,7 +9927,7 @@ end:
   ret <26 x i32> %phi
 }
 
-define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i32 inreg %b) {
+define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v52f16_to_v26i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9935,33 +9935,33 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v32, s30, 0
-; SI-NEXT:    v_writelane_b32 v32, s31, 1
-; SI-NEXT:    v_writelane_b32 v32, s34, 2
-; SI-NEXT:    v_writelane_b32 v32, s35, 3
-; SI-NEXT:    v_writelane_b32 v32, s36, 4
-; SI-NEXT:    v_writelane_b32 v32, s37, 5
-; SI-NEXT:    v_writelane_b32 v32, s38, 6
-; SI-NEXT:    v_writelane_b32 v32, s39, 7
-; SI-NEXT:    v_writelane_b32 v32, s48, 8
-; SI-NEXT:    v_writelane_b32 v32, s49, 9
-; SI-NEXT:    v_writelane_b32 v32, s50, 10
-; SI-NEXT:    v_writelane_b32 v32, s51, 11
-; SI-NEXT:    v_writelane_b32 v32, s52, 12
-; SI-NEXT:    v_writelane_b32 v32, s53, 13
-; SI-NEXT:    v_writelane_b32 v32, s54, 14
-; SI-NEXT:    v_writelane_b32 v32, s55, 15
-; SI-NEXT:    v_writelane_b32 v32, s64, 16
-; SI-NEXT:    v_writelane_b32 v32, s65, 17
-; SI-NEXT:    v_writelane_b32 v32, s66, 18
-; SI-NEXT:    v_writelane_b32 v32, s67, 19
-; SI-NEXT:    v_writelane_b32 v32, s68, 20
-; SI-NEXT:    v_writelane_b32 v32, s69, 21
-; SI-NEXT:    v_writelane_b32 v32, s70, 22
-; SI-NEXT:    v_writelane_b32 v32, s71, 23
-; SI-NEXT:    v_writelane_b32 v32, s80, 24
-; SI-NEXT:    v_writelane_b32 v32, s81, 25
-; SI-NEXT:    v_writelane_b32 v32, s82, 26
+; SI-NEXT:    v_writelane_b32 v32, s34, 0
+; SI-NEXT:    v_writelane_b32 v32, s35, 1
+; SI-NEXT:    v_writelane_b32 v32, s36, 2
+; SI-NEXT:    v_writelane_b32 v32, s37, 3
+; SI-NEXT:    v_writelane_b32 v32, s38, 4
+; SI-NEXT:    v_writelane_b32 v32, s39, 5
+; SI-NEXT:    v_writelane_b32 v32, s48, 6
+; SI-NEXT:    v_writelane_b32 v32, s49, 7
+; SI-NEXT:    v_writelane_b32 v32, s50, 8
+; SI-NEXT:    v_writelane_b32 v32, s51, 9
+; SI-NEXT:    v_writelane_b32 v32, s52, 10
+; SI-NEXT:    v_writelane_b32 v32, s53, 11
+; SI-NEXT:    v_writelane_b32 v32, s54, 12
+; SI-NEXT:    v_writelane_b32 v32, s55, 13
+; SI-NEXT:    v_writelane_b32 v32, s64, 14
+; SI-NEXT:    v_writelane_b32 v32, s65, 15
+; SI-NEXT:    v_writelane_b32 v32, s66, 16
+; SI-NEXT:    v_writelane_b32 v32, s67, 17
+; SI-NEXT:    v_writelane_b32 v32, s68, 18
+; SI-NEXT:    v_writelane_b32 v32, s69, 19
+; SI-NEXT:    v_writelane_b32 v32, s70, 20
+; SI-NEXT:    v_writelane_b32 v32, s71, 21
+; SI-NEXT:    v_writelane_b32 v32, s80, 22
+; SI-NEXT:    v_writelane_b32 v32, s81, 23
+; SI-NEXT:    v_writelane_b32 v32, s82, 24
+; SI-NEXT:    v_writelane_b32 v32, s83, 25
+; SI-NEXT:    v_writelane_b32 v32, s30, 26
 ; SI-NEXT:    v_readfirstlane_b32 s6, v11
 ; SI-NEXT:    v_readfirstlane_b32 s8, v10
 ; SI-NEXT:    v_readfirstlane_b32 s10, v9
@@ -9974,7 +9974,7 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i
 ; SI-NEXT:    v_readfirstlane_b32 s88, v2
 ; SI-NEXT:    v_readfirstlane_b32 s91, v1
 ; SI-NEXT:    v_readfirstlane_b32 s94, v0
-; SI-NEXT:    v_writelane_b32 v32, s83, 27
+; SI-NEXT:    v_writelane_b32 v32, s31, 27
 ; SI-NEXT:    s_lshr_b32 s90, s29, 16
 ; SI-NEXT:    s_lshr_b32 s93, s28, 16
 ; SI-NEXT:    s_lshr_b32 s30, s27, 16
@@ -10331,34 +10331,34 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v30, s66
 ; SI-NEXT:    v_mov_b32_e32 v31, s67
 ; SI-NEXT:  .LBB19_5: ; %end
-; SI-NEXT:    v_readlane_b32 s83, v32, 27
-; SI-NEXT:    v_readlane_b32 s82, v32, 26
-; SI-NEXT:    v_readlane_b32 s81, v32, 25
-; SI-NEXT:    v_readlane_b32 s80, v32, 24
-; SI-NEXT:    v_readlane_b32 s71, v32, 23
-; SI-NEXT:    v_readlane_b32 s70, v32, 22
-; SI-NEXT:    v_readlane_b32 s69, v32, 21
-; SI-NEXT:    v_readlane_b32 s68, v32, 20
-; SI-NEXT:    v_readlane_b32 s67, v32, 19
-; SI-NEXT:    v_readlane_b32 s66, v32, 18
-; SI-NEXT:    v_readlane_b32 s65, v32, 17
-; SI-NEXT:    v_readlane_b32 s64, v32, 16
-; SI-NEXT:    v_readlane_b32 s55, v32, 15
-; SI-NEXT:    v_readlane_b32 s54, v32, 14
-; SI-NEXT:    v_readlane_b32 s53, v32, 13
-; SI-NEXT:    v_readlane_b32 s52, v32, 12
-; SI-NEXT:    v_readlane_b32 s51, v32, 11
-; SI-NEXT:    v_readlane_b32 s50, v32, 10
-; SI-NEXT:    v_readlane_b32 s49, v32, 9
-; SI-NEXT:    v_readlane_b32 s48, v32, 8
-; SI-NEXT:    v_readlane_b32 s39, v32, 7
-; SI-NEXT:    v_readlane_b32 s38, v32, 6
-; SI-NEXT:    v_readlane_b32 s37, v32, 5
-; SI-NEXT:    v_readlane_b32 s36, v32, 4
-; SI-NEXT:    v_readlane_b32 s35, v32, 3
-; SI-NEXT:    v_readlane_b32 s34, v32, 2
-; SI-NEXT:    v_readlane_b32 s31, v32, 1
-; SI-NEXT:    v_readlane_b32 s30, v32, 0
+; SI-NEXT:    v_readlane_b32 s30, v32, 26
+; SI-NEXT:    v_readlane_b32 s31, v32, 27
+; SI-NEXT:    v_readlane_b32 s83, v32, 25
+; SI-NEXT:    v_readlane_b32 s82, v32, 24
+; SI-NEXT:    v_readlane_b32 s81, v32, 23
+; SI-NEXT:    v_readlane_b32 s80, v32, 22
+; SI-NEXT:    v_readlane_b32 s71, v32, 21
+; SI-NEXT:    v_readlane_b32 s70, v32, 20
+; SI-NEXT:    v_readlane_b32 s69, v32, 19
+; SI-NEXT:    v_readlane_b32 s68, v32, 18
+; SI-NEXT:    v_readlane_b32 s67, v32, 17
+; SI-NEXT:    v_readlane_b32 s66, v32, 16
+; SI-NEXT:    v_readlane_b32 s65, v32, 15
+; SI-NEXT:    v_readlane_b32 s64, v32, 14
+; SI-NEXT:    v_readlane_b32 s55, v32, 13
+; SI-NEXT:    v_readlane_b32 s54, v32, 12
+; SI-NEXT:    v_readlane_b32 s53, v32, 11
+; SI-NEXT:    v_readlane_b32 s52, v32, 10
+; SI-NEXT:    v_readlane_b32 s51, v32, 9
+; SI-NEXT:    v_readlane_b32 s50, v32, 8
+; SI-NEXT:    v_readlane_b32 s49, v32, 7
+; SI-NEXT:    v_readlane_b32 s48, v32, 6
+; SI-NEXT:    v_readlane_b32 s39, v32, 5
+; SI-NEXT:    v_readlane_b32 s38, v32, 4
+; SI-NEXT:    v_readlane_b32 s37, v32, 3
+; SI-NEXT:    v_readlane_b32 s36, v32, 2
+; SI-NEXT:    v_readlane_b32 s35, v32, 1
+; SI-NEXT:    v_readlane_b32 s34, v32, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -10371,37 +10371,38 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v32, s30, 0
-; VI-NEXT:    v_writelane_b32 v32, s31, 1
-; VI-NEXT:    v_writelane_b32 v32, s34, 2
-; VI-NEXT:    v_writelane_b32 v32, s35, 3
-; VI-NEXT:    v_writelane_b32 v32, s36, 4
-; VI-NEXT:    v_writelane_b32 v32, s37, 5
-; VI-NEXT:    v_writelane_b32 v32, s38, 6
-; VI-NEXT:    v_writelane_b32 v32, s39, 7
-; VI-NEXT:    v_writelane_b32 v32, s48, 8
-; VI-NEXT:    v_writelane_b32 v32, s49, 9
-; VI-NEXT:    v_writelane_b32 v32, s50, 10
-; VI-NEXT:    v_writelane_b32 v32, s51, 11
-; VI-NEXT:    v_writelane_b32 v32, s52, 12
-; VI-NEXT:    v_writelane_b32 v32, s53, 13
-; VI-NEXT:    v_writelane_b32 v32, s54, 14
-; VI-NEXT:    v_writelane_b32 v32, s55, 15
-; VI-NEXT:    v_writelane_b32 v32, s64, 16
-; VI-NEXT:    v_writelane_b32 v32, s65, 17
-; VI-NEXT:    v_writelane_b32 v32, s66, 18
-; VI-NEXT:    v_writelane_b32 v32, s67, 19
-; VI-NEXT:    v_writelane_b32 v32, s68, 20
-; VI-NEXT:    v_writelane_b32 v32, s69, 21
-; VI-NEXT:    v_writelane_b32 v32, s70, 22
-; VI-NEXT:    v_writelane_b32 v32, s71, 23
-; VI-NEXT:    v_writelane_b32 v32, s80, 24
-; VI-NEXT:    v_writelane_b32 v32, s81, 25
-; VI-NEXT:    v_writelane_b32 v32, s82, 26
-; VI-NEXT:    v_writelane_b32 v32, s83, 27
-; VI-NEXT:    v_writelane_b32 v32, s84, 28
-; VI-NEXT:    v_writelane_b32 v32, s85, 29
-; VI-NEXT:    v_writelane_b32 v32, s86, 30
+; VI-NEXT:    v_writelane_b32 v32, s34, 0
+; VI-NEXT:    v_writelane_b32 v32, s35, 1
+; VI-NEXT:    v_writelane_b32 v32, s36, 2
+; VI-NEXT:    v_writelane_b32 v32, s37, 3
+; VI-NEXT:    v_writelane_b32 v32, s38, 4
+; VI-NEXT:    v_writelane_b32 v32, s39, 5
+; VI-NEXT:    v_writelane_b32 v32, s48, 6
+; VI-NEXT:    v_writelane_b32 v32, s49, 7
+; VI-NEXT:    v_writelane_b32 v32, s50, 8
+; VI-NEXT:    v_writelane_b32 v32, s51, 9
+; VI-NEXT:    v_writelane_b32 v32, s52, 10
+; VI-NEXT:    v_writelane_b32 v32, s53, 11
+; VI-NEXT:    v_writelane_b32 v32, s54, 12
+; VI-NEXT:    v_writelane_b32 v32, s55, 13
+; VI-NEXT:    v_writelane_b32 v32, s64, 14
+; VI-NEXT:    v_writelane_b32 v32, s65, 15
+; VI-NEXT:    v_writelane_b32 v32, s66, 16
+; VI-NEXT:    v_writelane_b32 v32, s67, 17
+; VI-NEXT:    v_writelane_b32 v32, s68, 18
+; VI-NEXT:    v_writelane_b32 v32, s69, 19
+; VI-NEXT:    v_writelane_b32 v32, s70, 20
+; VI-NEXT:    v_writelane_b32 v32, s71, 21
+; VI-NEXT:    v_writelane_b32 v32, s80, 22
+; VI-NEXT:    v_writelane_b32 v32, s81, 23
+; VI-NEXT:    v_writelane_b32 v32, s82, 24
+; VI-NEXT:    v_writelane_b32 v32, s83, 25
+; VI-NEXT:    v_writelane_b32 v32, s84, 26
+; VI-NEXT:    v_writelane_b32 v32, s85, 27
+; VI-NEXT:    v_writelane_b32 v32, s86, 28
+; VI-NEXT:    v_writelane_b32 v32, s87, 29
+; VI-NEXT:    v_writelane_b32 v32, s30, 30
+; VI-NEXT:    v_writelane_b32 v32, s31, 31
 ; VI-NEXT:    v_readfirstlane_b32 s6, v11
 ; VI-NEXT:    v_readfirstlane_b32 s8, v10
 ; VI-NEXT:    v_readfirstlane_b32 s10, v9
@@ -10414,7 +10415,6 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i
 ; VI-NEXT:    v_readfirstlane_b32 s30, v2
 ; VI-NEXT:    v_readfirstlane_b32 s35, v1
 ; VI-NEXT:    v_readfirstlane_b32 s71, v0
-; VI-NEXT:    v_writelane_b32 v32, s87, 31
 ; VI-NEXT:    s_lshr_b32 s74, s29, 16
 ; VI-NEXT:    s_lshr_b32 s77, s28, 16
 ; VI-NEXT:    s_lshr_b32 s88, s27, 16
@@ -10668,38 +10668,38 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i
 ; VI-NEXT:    v_mov_b32_e32 v30, s66
 ; VI-NEXT:    v_mov_b32_e32 v31, s67
 ; VI-NEXT:  .LBB19_5: ; %end
-; VI-NEXT:    v_readlane_b32 s87, v32, 31
-; VI-NEXT:    v_readlane_b32 s86, v32, 30
-; VI-NEXT:    v_readlane_b32 s85, v32, 29
-; VI-NEXT:    v_readlane_b32 s84, v32, 28
-; VI-NEXT:    v_readlane_b32 s83, v32, 27
-; VI-NEXT:    v_readlane_b32 s82, v32, 26
-; VI-NEXT:    v_readlane_b32 s81, v32, 25
-; VI-NEXT:    v_readlane_b32 s80, v32, 24
-; VI-NEXT:    v_readlane_b32 s71, v32, 23
-; VI-NEXT:    v_readlane_b32 s70, v32, 22
-; VI-NEXT:    v_readlane_b32 s69, v32, 21
-; VI-NEXT:    v_readlane_b32 s68, v32, 20
-; VI-NEXT:    v_readlane_b32 s67, v32, 19
-; VI-NEXT:    v_readlane_b32 s66, v32, 18
-; VI-NEXT:    v_readlane_b32 s65, v32, 17
-; VI-NEXT:    v_readlane_b32 s64, v32, 16
-; VI-NEXT:    v_readlane_b32 s55, v32, 15
-; VI-NEXT:    v_readlane_b32 s54, v32, 14
-; VI-NEXT:    v_readlane_b32 s53, v32, 13
-; VI-NEXT:    v_readlane_b32 s52, v32, 12
-; VI-NEXT:    v_readlane_b32 s51, v32, 11
-; VI-NEXT:    v_readlane_b32 s50, v32, 10
-; VI-NEXT:    v_readlane_b32 s49, v32, 9
-; VI-NEXT:    v_readlane_b32 s48, v32, 8
-; VI-NEXT:    v_readlane_b32 s39, v32, 7
-; VI-NEXT:    v_readlane_b32 s38, v32, 6
-; VI-NEXT:    v_readlane_b32 s37, v32, 5
-; VI-NEXT:    v_readlane_b32 s36, v32, 4
-; VI-NEXT:    v_readlane_b32 s35, v32, 3
-; VI-NEXT:    v_readlane_b32 s34, v32, 2
-; VI-NEXT:    v_readlane_b32 s31, v32, 1
-; VI-NEXT:    v_readlane_b32 s30, v32, 0
+; VI-NEXT:    v_readlane_b32 s30, v32, 30
+; VI-NEXT:    v_readlane_b32 s31, v32, 31
+; VI-NEXT:    v_readlane_b32 s87, v32, 29
+; VI-NEXT:    v_readlane_b32 s86, v32, 28
+; VI-NEXT:    v_readlane_b32 s85, v32, 27
+; VI-NEXT:    v_readlane_b32 s84, v32, 26
+; VI-NEXT:    v_readlane_b32 s83, v32, 25
+; VI-NEXT:    v_readlane_b32 s82, v32, 24
+; VI-NEXT:    v_readlane_b32 s81, v32, 23
+; VI-NEXT:    v_readlane_b32 s80, v32, 22
+; VI-NEXT:    v_readlane_b32 s71, v32, 21
+; VI-NEXT:    v_readlane_b32 s70, v32, 20
+; VI-NEXT:    v_readlane_b32 s69, v32, 19
+; VI-NEXT:    v_readlane_b32 s68, v32, 18
+; VI-NEXT:    v_readlane_b32 s67, v32, 17
+; VI-NEXT:    v_readlane_b32 s66, v32, 16
+; VI-NEXT:    v_readlane_b32 s65, v32, 15
+; VI-NEXT:    v_readlane_b32 s64, v32, 14
+; VI-NEXT:    v_readlane_b32 s55, v32, 13
+; VI-NEXT:    v_readlane_b32 s54, v32, 12
+; VI-NEXT:    v_readlane_b32 s53, v32, 11
+; VI-NEXT:    v_readlane_b32 s52, v32, 10
+; VI-NEXT:    v_readlane_b32 s51, v32, 9
+; VI-NEXT:    v_readlane_b32 s50, v32, 8
+; VI-NEXT:    v_readlane_b32 s49, v32, 7
+; VI-NEXT:    v_readlane_b32 s48, v32, 6
+; VI-NEXT:    v_readlane_b32 s39, v32, 5
+; VI-NEXT:    v_readlane_b32 s38, v32, 4
+; VI-NEXT:    v_readlane_b32 s37, v32, 3
+; VI-NEXT:    v_readlane_b32 s36, v32, 2
+; VI-NEXT:    v_readlane_b32 s35, v32, 1
+; VI-NEXT:    v_readlane_b32 s34, v32, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -11011,7 +11011,7 @@ end:
   ret <26 x i32> %phi
 }
 
-define <13 x i64> @bitcast_v26f32_to_v13i64(<26 x float> %a, i32 %b) {
+define <13 x i64> @bitcast_v26f32_to_v13i64(<26 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v26f32_to_v13i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11172,7 +11172,7 @@ end:
   ret <13 x i64> %phi
 }
 
-define inreg <13 x i64> @bitcast_v26f32_to_v13i64_scalar(<26 x float> inreg %a, i32 inreg %b) {
+define inreg <13 x i64> @bitcast_v26f32_to_v13i64_scalar(<26 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v26f32_to_v13i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11700,7 +11700,7 @@ end:
   ret <13 x i64> %phi
 }
 
-define <26 x float> @bitcast_v13i64_to_v26f32(<13 x i64> %a, i32 %b) {
+define <26 x float> @bitcast_v13i64_to_v26f32(<13 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v13i64_to_v26f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11881,7 +11881,7 @@ end:
   ret <26 x float> %phi
 }
 
-define inreg <26 x float> @bitcast_v13i64_to_v26f32_scalar(<13 x i64> inreg %a, i32 inreg %b) {
+define inreg <26 x float> @bitcast_v13i64_to_v26f32_scalar(<13 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v13i64_to_v26f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12196,7 +12196,7 @@ end:
   ret <26 x float> %phi
 }
 
-define <13 x double> @bitcast_v26f32_to_v13f64(<26 x float> %a, i32 %b) {
+define <13 x double> @bitcast_v26f32_to_v13f64(<26 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v26f32_to_v13f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12357,7 +12357,7 @@ end:
   ret <13 x double> %phi
 }
 
-define inreg <13 x double> @bitcast_v26f32_to_v13f64_scalar(<26 x float> inreg %a, i32 inreg %b) {
+define inreg <13 x double> @bitcast_v26f32_to_v13f64_scalar(<26 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v26f32_to_v13f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12885,7 +12885,7 @@ end:
   ret <13 x double> %phi
 }
 
-define <26 x float> @bitcast_v13f64_to_v26f32(<13 x double> %a, i32 %b) {
+define <26 x float> @bitcast_v13f64_to_v26f32(<13 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v13f64_to_v26f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13007,7 +13007,7 @@ end:
   ret <26 x float> %phi
 }
 
-define inreg <26 x float> @bitcast_v13f64_to_v26f32_scalar(<13 x double> inreg %a, i32 inreg %b) {
+define inreg <26 x float> @bitcast_v13f64_to_v26f32_scalar(<13 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v13f64_to_v26f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13483,7 +13483,7 @@ end:
   ret <26 x float> %phi
 }
 
-define <52 x i16> @bitcast_v26f32_to_v52i16(<26 x float> %a, i32 %b) {
+define <52 x i16> @bitcast_v26f32_to_v52i16(<26 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v26f32_to_v52i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14215,7 +14215,7 @@ end:
   ret <52 x i16> %phi
 }
 
-define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a, i32 inreg %b) {
+define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v26f32_to_v52i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14223,15 +14223,15 @@ define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a,
 ; SI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v45, s30, 0
-; SI-NEXT:    v_writelane_b32 v45, s31, 1
-; SI-NEXT:    v_writelane_b32 v45, s34, 2
-; SI-NEXT:    v_writelane_b32 v45, s35, 3
-; SI-NEXT:    v_writelane_b32 v45, s36, 4
-; SI-NEXT:    v_writelane_b32 v45, s37, 5
-; SI-NEXT:    v_writelane_b32 v45, s38, 6
+; SI-NEXT:    v_writelane_b32 v45, s34, 0
+; SI-NEXT:    v_writelane_b32 v45, s35, 1
+; SI-NEXT:    v_writelane_b32 v45, s36, 2
+; SI-NEXT:    v_writelane_b32 v45, s37, 3
+; SI-NEXT:    v_writelane_b32 v45, s38, 4
+; SI-NEXT:    v_writelane_b32 v45, s39, 5
+; SI-NEXT:    v_writelane_b32 v45, s48, 6
 ; SI-NEXT:    v_readfirstlane_b32 s14, v12
-; SI-NEXT:    v_writelane_b32 v45, s39, 7
+; SI-NEXT:    v_writelane_b32 v45, s30, 7
 ; SI-NEXT:    v_readfirstlane_b32 s5, v11
 ; SI-NEXT:    v_readfirstlane_b32 s4, v10
 ; SI-NEXT:    v_readfirstlane_b32 s7, v9
@@ -14250,7 +14250,7 @@ define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a,
 ; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 ; 4-byte Folded Spill
-; SI-NEXT:    v_writelane_b32 v45, s48, 8
+; SI-NEXT:    v_writelane_b32 v45, s31, 8
 ; SI-NEXT:    s_cbranch_scc0 .LBB29_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s48, s5, 16
@@ -14492,6 +14492,7 @@ define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a,
 ; SI-NEXT:    v_or_b32_e32 v24, v24, v26
 ; SI-NEXT:    v_and_b32_e32 v25, 0xffff, v25
 ; SI-NEXT:    v_lshlrev_b32_e32 v26, 16, v48
+; SI-NEXT:    v_readlane_b32 s30, v45, 7
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v38
 ; SI-NEXT:    v_or_b32_e32 v3, v3, v37
 ; SI-NEXT:    v_or_b32_e32 v5, v5, v36
@@ -14505,15 +14506,14 @@ define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a,
 ; SI-NEXT:    v_or_b32_e32 v21, v21, v28
 ; SI-NEXT:    v_or_b32_e32 v23, v23, v27
 ; SI-NEXT:    v_or_b32_e32 v25, v25, v26
-; SI-NEXT:    v_readlane_b32 s48, v45, 8
-; SI-NEXT:    v_readlane_b32 s39, v45, 7
-; SI-NEXT:    v_readlane_b32 s38, v45, 6
-; SI-NEXT:    v_readlane_b32 s37, v45, 5
-; SI-NEXT:    v_readlane_b32 s36, v45, 4
-; SI-NEXT:    v_readlane_b32 s35, v45, 3
-; SI-NEXT:    v_readlane_b32 s34, v45, 2
-; SI-NEXT:    v_readlane_b32 s31, v45, 1
-; SI-NEXT:    v_readlane_b32 s30, v45, 0
+; SI-NEXT:    v_readlane_b32 s31, v45, 8
+; SI-NEXT:    v_readlane_b32 s48, v45, 6
+; SI-NEXT:    v_readlane_b32 s39, v45, 5
+; SI-NEXT:    v_readlane_b32 s38, v45, 4
+; SI-NEXT:    v_readlane_b32 s37, v45, 3
+; SI-NEXT:    v_readlane_b32 s36, v45, 2
+; SI-NEXT:    v_readlane_b32 s35, v45, 1
+; SI-NEXT:    v_readlane_b32 s34, v45, 0
 ; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -15417,7 +15417,7 @@ end:
   ret <52 x i16> %phi
 }
 
-define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) {
+define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v52i16_to_v26f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16566,7 +16566,7 @@ end:
   ret <26 x float> %phi
 }
 
-define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a, i32 inreg %b) {
+define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v52i16_to_v26f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16574,33 +16574,34 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a,
 ; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v26, s30, 0
-; SI-NEXT:    v_writelane_b32 v26, s31, 1
-; SI-NEXT:    v_writelane_b32 v26, s34, 2
-; SI-NEXT:    v_writelane_b32 v26, s35, 3
-; SI-NEXT:    v_writelane_b32 v26, s36, 4
-; SI-NEXT:    v_writelane_b32 v26, s37, 5
-; SI-NEXT:    v_writelane_b32 v26, s38, 6
-; SI-NEXT:    v_writelane_b32 v26, s39, 7
-; SI-NEXT:    v_writelane_b32 v26, s48, 8
-; SI-NEXT:    v_writelane_b32 v26, s49, 9
-; SI-NEXT:    v_writelane_b32 v26, s50, 10
-; SI-NEXT:    v_writelane_b32 v26, s51, 11
-; SI-NEXT:    v_writelane_b32 v26, s52, 12
-; SI-NEXT:    v_writelane_b32 v26, s53, 13
-; SI-NEXT:    v_writelane_b32 v26, s54, 14
-; SI-NEXT:    v_writelane_b32 v26, s55, 15
-; SI-NEXT:    v_writelane_b32 v26, s64, 16
-; SI-NEXT:    v_writelane_b32 v26, s65, 17
-; SI-NEXT:    v_writelane_b32 v26, s66, 18
-; SI-NEXT:    v_writelane_b32 v26, s67, 19
-; SI-NEXT:    v_writelane_b32 v26, s68, 20
-; SI-NEXT:    v_writelane_b32 v26, s69, 21
-; SI-NEXT:    v_writelane_b32 v26, s70, 22
-; SI-NEXT:    v_writelane_b32 v26, s71, 23
-; SI-NEXT:    v_writelane_b32 v26, s80, 24
-; SI-NEXT:    v_writelane_b32 v26, s81, 25
-; SI-NEXT:    v_writelane_b32 v26, s82, 26
+; SI-NEXT:    v_writelane_b32 v26, s34, 0
+; SI-NEXT:    v_writelane_b32 v26, s35, 1
+; SI-NEXT:    v_writelane_b32 v26, s36, 2
+; SI-NEXT:    v_writelane_b32 v26, s37, 3
+; SI-NEXT:    v_writelane_b32 v26, s38, 4
+; SI-NEXT:    v_writelane_b32 v26, s39, 5
+; SI-NEXT:    v_writelane_b32 v26, s48, 6
+; SI-NEXT:    v_writelane_b32 v26, s49, 7
+; SI-NEXT:    v_writelane_b32 v26, s50, 8
+; SI-NEXT:    v_writelane_b32 v26, s51, 9
+; SI-NEXT:    v_writelane_b32 v26, s52, 10
+; SI-NEXT:    v_writelane_b32 v26, s53, 11
+; SI-NEXT:    v_writelane_b32 v26, s54, 12
+; SI-NEXT:    v_writelane_b32 v26, s55, 13
+; SI-NEXT:    v_writelane_b32 v26, s64, 14
+; SI-NEXT:    v_writelane_b32 v26, s65, 15
+; SI-NEXT:    v_writelane_b32 v26, s66, 16
+; SI-NEXT:    v_writelane_b32 v26, s67, 17
+; SI-NEXT:    v_writelane_b32 v26, s68, 18
+; SI-NEXT:    v_writelane_b32 v26, s69, 19
+; SI-NEXT:    v_writelane_b32 v26, s70, 20
+; SI-NEXT:    v_writelane_b32 v26, s71, 21
+; SI-NEXT:    v_writelane_b32 v26, s80, 22
+; SI-NEXT:    v_writelane_b32 v26, s81, 23
+; SI-NEXT:    v_writelane_b32 v26, s82, 24
+; SI-NEXT:    v_writelane_b32 v26, s83, 25
+; SI-NEXT:    v_writelane_b32 v26, s30, 26
+; SI-NEXT:    v_writelane_b32 v26, s31, 27
 ; SI-NEXT:    v_readfirstlane_b32 s7, v11
 ; SI-NEXT:    v_readfirstlane_b32 s9, v10
 ; SI-NEXT:    v_readfirstlane_b32 s11, v9
@@ -16613,7 +16614,6 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a,
 ; SI-NEXT:    v_readfirstlane_b32 s93, v2
 ; SI-NEXT:    v_readfirstlane_b32 s30, v1
 ; SI-NEXT:    v_readfirstlane_b32 s35, v0
-; SI-NEXT:    v_writelane_b32 v26, s83, 27
 ; SI-NEXT:    s_lshr_b32 s76, s29, 16
 ; SI-NEXT:    s_lshr_b32 s79, s28, 16
 ; SI-NEXT:    s_lshr_b32 s89, s27, 16
@@ -16855,6 +16855,7 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a,
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_add_i32 s61, s4, 0x30000
 ; SI-NEXT:  .LBB31_3: ; %end
+; SI-NEXT:    v_readlane_b32 s30, v26, 26
 ; SI-NEXT:    v_mov_b32_e32 v0, s36
 ; SI-NEXT:    v_mov_b32_e32 v1, s37
 ; SI-NEXT:    v_mov_b32_e32 v2, s38
@@ -16881,34 +16882,33 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a,
 ; SI-NEXT:    v_mov_b32_e32 v23, s59
 ; SI-NEXT:    v_mov_b32_e32 v24, s60
 ; SI-NEXT:    v_mov_b32_e32 v25, s61
-; SI-NEXT:    v_readlane_b32 s83, v26, 27
-; SI-NEXT:    v_readlane_b32 s82, v26, 26
-; SI-NEXT:    v_readlane_b32 s81, v26, 25
-; SI-NEXT:    v_readlane_b32 s80, v26, 24
-; SI-NEXT:    v_readlane_b32 s71, v26, 23
-; SI-NEXT:    v_readlane_b32 s70, v26, 22
-; SI-NEXT:    v_readlane_b32 s69, v26, 21
-; SI-NEXT:    v_readlane_b32 s68, v26, 20
-; SI-NEXT:    v_readlane_b32 s67, v26, 19
-; SI-NEXT:    v_readlane_b32 s66, v26, 18
-; SI-NEXT:    v_readlane_b32 s65, v26, 17
-; SI-NEXT:    v_readlane_b32 s64, v26, 16
-; SI-NEXT:    v_readlane_b32 s55, v26, 15
-; SI-NEXT:    v_readlane_b32 s54, v26, 14
-; SI-NEXT:    v_readlane_b32 s53, v26, 13
-; SI-NEXT:    v_readlane_b32 s52, v26, 12
-; SI-NEXT:    v_readlane_b32 s51, v26, 11
-; SI-NEXT:    v_readlane_b32 s50, v26, 10
-; SI-NEXT:    v_readlane_b32 s49, v26, 9
-; SI-NEXT:    v_readlane_b32 s48, v26, 8
-; SI-NEXT:    v_readlane_b32 s39, v26, 7
-; SI-NEXT:    v_readlane_b32 s38, v26, 6
-; SI-NEXT:    v_readlane_b32 s37, v26, 5
-; SI-NEXT:    v_readlane_b32 s36, v26, 4
-; SI-NEXT:    v_readlane_b32 s35, v26, 3
-; SI-NEXT:    v_readlane_b32 s34, v26, 2
-; SI-NEXT:    v_readlane_b32 s31, v26, 1
-; SI-NEXT:    v_readlane_b32 s30, v26, 0
+; SI-NEXT:    v_readlane_b32 s31, v26, 27
+; SI-NEXT:    v_readlane_b32 s83, v26, 25
+; SI-NEXT:    v_readlane_b32 s82, v26, 24
+; SI-NEXT:    v_readlane_b32 s81, v26, 23
+; SI-NEXT:    v_readlane_b32 s80, v26, 22
+; SI-NEXT:    v_readlane_b32 s71, v26, 21
+; SI-NEXT:    v_readlane_b32 s70, v26, 20
+; SI-NEXT:    v_readlane_b32 s69, v26, 19
+; SI-NEXT:    v_readlane_b32 s68, v26, 18
+; SI-NEXT:    v_readlane_b32 s67, v26, 17
+; SI-NEXT:    v_readlane_b32 s66, v26, 16
+; SI-NEXT:    v_readlane_b32 s65, v26, 15
+; SI-NEXT:    v_readlane_b32 s64, v26, 14
+; SI-NEXT:    v_readlane_b32 s55, v26, 13
+; SI-NEXT:    v_readlane_b32 s54, v26, 12
+; SI-NEXT:    v_readlane_b32 s53, v26, 11
+; SI-NEXT:    v_readlane_b32 s52, v26, 10
+; SI-NEXT:    v_readlane_b32 s51, v26, 9
+; SI-NEXT:    v_readlane_b32 s50, v26, 8
+; SI-NEXT:    v_readlane_b32 s49, v26, 7
+; SI-NEXT:    v_readlane_b32 s48, v26, 6
+; SI-NEXT:    v_readlane_b32 s39, v26, 5
+; SI-NEXT:    v_readlane_b32 s38, v26, 4
+; SI-NEXT:    v_readlane_b32 s37, v26, 3
+; SI-NEXT:    v_readlane_b32 s36, v26, 2
+; SI-NEXT:    v_readlane_b32 s35, v26, 1
+; SI-NEXT:    v_readlane_b32 s34, v26, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -16924,37 +16924,38 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a,
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v26, s30, 0
-; VI-NEXT:    v_writelane_b32 v26, s31, 1
-; VI-NEXT:    v_writelane_b32 v26, s34, 2
-; VI-NEXT:    v_writelane_b32 v26, s35, 3
-; VI-NEXT:    v_writelane_b32 v26, s36, 4
-; VI-NEXT:    v_writelane_b32 v26, s37, 5
-; VI-NEXT:    v_writelane_b32 v26, s38, 6
-; VI-NEXT:    v_writelane_b32 v26, s39, 7
-; VI-NEXT:    v_writelane_b32 v26, s48, 8
-; VI-NEXT:    v_writelane_b32 v26, s49, 9
-; VI-NEXT:    v_writelane_b32 v26, s50, 10
-; VI-NEXT:    v_writelane_b32 v26, s51, 11
-; VI-NEXT:    v_writelane_b32 v26, s52, 12
-; VI-NEXT:    v_writelane_b32 v26, s53, 13
-; VI-NEXT:    v_writelane_b32 v26, s54, 14
-; VI-NEXT:    v_writelane_b32 v26, s55, 15
-; VI-NEXT:    v_writelane_b32 v26, s64, 16
-; VI-NEXT:    v_writelane_b32 v26, s65, 17
-; VI-NEXT:    v_writelane_b32 v26, s66, 18
-; VI-NEXT:    v_writelane_b32 v26, s67, 19
-; VI-NEXT:    v_writelane_b32 v26, s68, 20
-; VI-NEXT:    v_writelane_b32 v26, s69, 21
-; VI-NEXT:    v_writelane_b32 v26, s70, 22
-; VI-NEXT:    v_writelane_b32 v26, s71, 23
-; VI-NEXT:    v_writelane_b32 v26, s80, 24
-; VI-NEXT:    v_writelane_b32 v26, s81, 25
-; VI-NEXT:    v_writelane_b32 v26, s82, 26
-; VI-NEXT:    v_writelane_b32 v26, s83, 27
-; VI-NEXT:    v_writelane_b32 v26, s84, 28
-; VI-NEXT:    v_writelane_b32 v26, s85, 29
-; VI-NEXT:    v_writelane_b32 v26, s86, 30
+; VI-NEXT:    v_writelane_b32 v26, s34, 0
+; VI-NEXT:    v_writelane_b32 v26, s35, 1
+; VI-NEXT:    v_writelane_b32 v26, s36, 2
+; VI-NEXT:    v_writelane_b32 v26, s37, 3
+; VI-NEXT:    v_writelane_b32 v26, s38, 4
+; VI-NEXT:    v_writelane_b32 v26, s39, 5
+; VI-NEXT:    v_writelane_b32 v26, s48, 6
+; VI-NEXT:    v_writelane_b32 v26, s49, 7
+; VI-NEXT:    v_writelane_b32 v26, s50, 8
+; VI-NEXT:    v_writelane_b32 v26, s51, 9
+; VI-NEXT:    v_writelane_b32 v26, s52, 10
+; VI-NEXT:    v_writelane_b32 v26, s53, 11
+; VI-NEXT:    v_writelane_b32 v26, s54, 12
+; VI-NEXT:    v_writelane_b32 v26, s55, 13
+; VI-NEXT:    v_writelane_b32 v26, s64, 14
+; VI-NEXT:    v_writelane_b32 v26, s65, 15
+; VI-NEXT:    v_writelane_b32 v26, s66, 16
+; VI-NEXT:    v_writelane_b32 v26, s67, 17
+; VI-NEXT:    v_writelane_b32 v26, s68, 18
+; VI-NEXT:    v_writelane_b32 v26, s69, 19
+; VI-NEXT:    v_writelane_b32 v26, s70, 20
+; VI-NEXT:    v_writelane_b32 v26, s71, 21
+; VI-NEXT:    v_writelane_b32 v26, s80, 22
+; VI-NEXT:    v_writelane_b32 v26, s81, 23
+; VI-NEXT:    v_writelane_b32 v26, s82, 24
+; VI-NEXT:    v_writelane_b32 v26, s83, 25
+; VI-NEXT:    v_writelane_b32 v26, s84, 26
+; VI-NEXT:    v_writelane_b32 v26, s85, 27
+; VI-NEXT:    v_writelane_b32 v26, s86, 28
+; VI-NEXT:    v_writelane_b32 v26, s87, 29
+; VI-NEXT:    v_writelane_b32 v26, s30, 30
+; VI-NEXT:    v_writelane_b32 v26, s31, 31
 ; VI-NEXT:    v_readfirstlane_b32 s7, v11
 ; VI-NEXT:    v_readfirstlane_b32 s9, v10
 ; VI-NEXT:    v_readfirstlane_b32 s11, v9
@@ -16967,7 +16968,6 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a,
 ; VI-NEXT:    v_readfirstlane_b32 s31, v2
 ; VI-NEXT:    v_readfirstlane_b32 s68, v1
 ; VI-NEXT:    v_readfirstlane_b32 s71, v0
-; VI-NEXT:    v_writelane_b32 v26, s87, 31
 ; VI-NEXT:    s_lshr_b32 s76, s29, 16
 ; VI-NEXT:    s_lshr_b32 s79, s28, 16
 ; VI-NEXT:    s_lshr_b32 s89, s27, 16
@@ -17209,6 +17209,7 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a,
 ; VI-NEXT:    s_or_b32 s4, s5, s4
 ; VI-NEXT:    s_add_i32 s61, s4, 0x30000
 ; VI-NEXT:  .LBB31_3: ; %end
+; VI-NEXT:    v_readlane_b32 s30, v26, 30
 ; VI-NEXT:    v_mov_b32_e32 v0, s36
 ; VI-NEXT:    v_mov_b32_e32 v1, s37
 ; VI-NEXT:    v_mov_b32_e32 v2, s38
@@ -17235,38 +17236,37 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a,
 ; VI-NEXT:    v_mov_b32_e32 v23, s59
 ; VI-NEXT:    v_mov_b32_e32 v24, s60
 ; VI-NEXT:    v_mov_b32_e32 v25, s61
-; VI-NEXT:    v_readlane_b32 s87, v26, 31
-; VI-NEXT:    v_readlane_b32 s86, v26, 30
-; VI-NEXT:    v_readlane_b32 s85, v26, 29
-; VI-NEXT:    v_readlane_b32 s84, v26, 28
-; VI-NEXT:    v_readlane_b32 s83, v26, 27
-; VI-NEXT:    v_readlane_b32 s82, v26, 26
-; VI-NEXT:    v_readlane_b32 s81, v26, 25
-; VI-NEXT:    v_readlane_b32 s80, v26, 24
-; VI-NEXT:    v_readlane_b32 s71, v26, 23
-; VI-NEXT:    v_readlane_b32 s70, v26, 22
-; VI-NEXT:    v_readlane_b32 s69, v26, 21
-; VI-NEXT:    v_readlane_b32 s68, v26, 20
-; VI-NEXT:    v_readlane_b32 s67, v26, 19
-; VI-NEXT:    v_readlane_b32 s66, v26, 18
-; VI-NEXT:    v_readlane_b32 s65, v26, 17
-; VI-NEXT:    v_readlane_b32 s64, v26, 16
-; VI-NEXT:    v_readlane_b32 s55, v26, 15
-; VI-NEXT:    v_readlane_b32 s54, v26, 14
-; VI-NEXT:    v_readlane_b32 s53, v26, 13
-; VI-NEXT:    v_readlane_b32 s52, v26, 12
-; VI-NEXT:    v_readlane_b32 s51, v26, 11
-; VI-NEXT:    v_readlane_b32 s50, v26, 10
-; VI-NEXT:    v_readlane_b32 s49, v26, 9
-; VI-NEXT:    v_readlane_b32 s48, v26, 8
-; VI-NEXT:    v_readlane_b32 s39, v26, 7
-; VI-NEXT:    v_readlane_b32 s38, v26, 6
-; VI-NEXT:    v_readlane_b32 s37, v26, 5
-; VI-NEXT:    v_readlane_b32 s36, v26, 4
-; VI-NEXT:    v_readlane_b32 s35, v26, 3
-; VI-NEXT:    v_readlane_b32 s34, v26, 2
-; VI-NEXT:    v_readlane_b32 s31, v26, 1
-; VI-NEXT:    v_readlane_b32 s30, v26, 0
+; VI-NEXT:    v_readlane_b32 s31, v26, 31
+; VI-NEXT:    v_readlane_b32 s87, v26, 29
+; VI-NEXT:    v_readlane_b32 s86, v26, 28
+; VI-NEXT:    v_readlane_b32 s85, v26, 27
+; VI-NEXT:    v_readlane_b32 s84, v26, 26
+; VI-NEXT:    v_readlane_b32 s83, v26, 25
+; VI-NEXT:    v_readlane_b32 s82, v26, 24
+; VI-NEXT:    v_readlane_b32 s81, v26, 23
+; VI-NEXT:    v_readlane_b32 s80, v26, 22
+; VI-NEXT:    v_readlane_b32 s71, v26, 21
+; VI-NEXT:    v_readlane_b32 s70, v26, 20
+; VI-NEXT:    v_readlane_b32 s69, v26, 19
+; VI-NEXT:    v_readlane_b32 s68, v26, 18
+; VI-NEXT:    v_readlane_b32 s67, v26, 17
+; VI-NEXT:    v_readlane_b32 s66, v26, 16
+; VI-NEXT:    v_readlane_b32 s65, v26, 15
+; VI-NEXT:    v_readlane_b32 s64, v26, 14
+; VI-NEXT:    v_readlane_b32 s55, v26, 13
+; VI-NEXT:    v_readlane_b32 s54, v26, 12
+; VI-NEXT:    v_readlane_b32 s53, v26, 11
+; VI-NEXT:    v_readlane_b32 s52, v26, 10
+; VI-NEXT:    v_readlane_b32 s51, v26, 9
+; VI-NEXT:    v_readlane_b32 s50, v26, 8
+; VI-NEXT:    v_readlane_b32 s49, v26, 7
+; VI-NEXT:    v_readlane_b32 s48, v26, 6
+; VI-NEXT:    v_readlane_b32 s39, v26, 5
+; VI-NEXT:    v_readlane_b32 s38, v26, 4
+; VI-NEXT:    v_readlane_b32 s37, v26, 3
+; VI-NEXT:    v_readlane_b32 s36, v26, 2
+; VI-NEXT:    v_readlane_b32 s35, v26, 1
+; VI-NEXT:    v_readlane_b32 s34, v26, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -17580,7 +17580,7 @@ end:
   ret <26 x float> %phi
 }
 
-define <52 x half> @bitcast_v26f32_to_v52f16(<26 x float> %a, i32 %b) {
+define <52 x half> @bitcast_v26f32_to_v52f16(<26 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v26f32_to_v52f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18312,7 +18312,7 @@ end:
   ret <52 x half> %phi
 }
 
-define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, i32 inreg %b) {
+define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v26f32_to_v52f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18320,15 +18320,15 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a,
 ; SI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v45, s30, 0
-; SI-NEXT:    v_writelane_b32 v45, s31, 1
-; SI-NEXT:    v_writelane_b32 v45, s34, 2
-; SI-NEXT:    v_writelane_b32 v45, s35, 3
-; SI-NEXT:    v_writelane_b32 v45, s36, 4
-; SI-NEXT:    v_writelane_b32 v45, s37, 5
-; SI-NEXT:    v_writelane_b32 v45, s38, 6
+; SI-NEXT:    v_writelane_b32 v45, s34, 0
+; SI-NEXT:    v_writelane_b32 v45, s35, 1
+; SI-NEXT:    v_writelane_b32 v45, s36, 2
+; SI-NEXT:    v_writelane_b32 v45, s37, 3
+; SI-NEXT:    v_writelane_b32 v45, s38, 4
+; SI-NEXT:    v_writelane_b32 v45, s39, 5
+; SI-NEXT:    v_writelane_b32 v45, s48, 6
 ; SI-NEXT:    v_readfirstlane_b32 s14, v12
-; SI-NEXT:    v_writelane_b32 v45, s39, 7
+; SI-NEXT:    v_writelane_b32 v45, s30, 7
 ; SI-NEXT:    v_readfirstlane_b32 s5, v11
 ; SI-NEXT:    v_readfirstlane_b32 s4, v10
 ; SI-NEXT:    v_readfirstlane_b32 s7, v9
@@ -18347,7 +18347,7 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a,
 ; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 ; 4-byte Folded Spill
-; SI-NEXT:    v_writelane_b32 v45, s48, 8
+; SI-NEXT:    v_writelane_b32 v45, s31, 8
 ; SI-NEXT:    s_cbranch_scc0 .LBB33_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s48, s5, 16
@@ -18589,6 +18589,7 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a,
 ; SI-NEXT:    v_or_b32_e32 v24, v24, v26
 ; SI-NEXT:    v_and_b32_e32 v25, 0xffff, v25
 ; SI-NEXT:    v_lshlrev_b32_e32 v26, 16, v48
+; SI-NEXT:    v_readlane_b32 s30, v45, 7
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v38
 ; SI-NEXT:    v_or_b32_e32 v3, v3, v37
 ; SI-NEXT:    v_or_b32_e32 v5, v5, v36
@@ -18602,15 +18603,14 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a,
 ; SI-NEXT:    v_or_b32_e32 v21, v21, v28
 ; SI-NEXT:    v_or_b32_e32 v23, v23, v27
 ; SI-NEXT:    v_or_b32_e32 v25, v25, v26
-; SI-NEXT:    v_readlane_b32 s48, v45, 8
-; SI-NEXT:    v_readlane_b32 s39, v45, 7
-; SI-NEXT:    v_readlane_b32 s38, v45, 6
-; SI-NEXT:    v_readlane_b32 s37, v45, 5
-; SI-NEXT:    v_readlane_b32 s36, v45, 4
-; SI-NEXT:    v_readlane_b32 s35, v45, 3
-; SI-NEXT:    v_readlane_b32 s34, v45, 2
-; SI-NEXT:    v_readlane_b32 s31, v45, 1
-; SI-NEXT:    v_readlane_b32 s30, v45, 0
+; SI-NEXT:    v_readlane_b32 s31, v45, 8
+; SI-NEXT:    v_readlane_b32 s48, v45, 6
+; SI-NEXT:    v_readlane_b32 s39, v45, 5
+; SI-NEXT:    v_readlane_b32 s38, v45, 4
+; SI-NEXT:    v_readlane_b32 s37, v45, 3
+; SI-NEXT:    v_readlane_b32 s36, v45, 2
+; SI-NEXT:    v_readlane_b32 s35, v45, 1
+; SI-NEXT:    v_readlane_b32 s34, v45, 0
 ; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -19514,7 +19514,7 @@ end:
   ret <52 x half> %phi
 }
 
-define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) {
+define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v52f16_to_v26f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20784,7 +20784,7 @@ end:
   ret <26 x float> %phi
 }
 
-define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, i32 inreg %b) {
+define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v52f16_to_v26f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20792,33 +20792,33 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a,
 ; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v32, s30, 0
-; SI-NEXT:    v_writelane_b32 v32, s31, 1
-; SI-NEXT:    v_writelane_b32 v32, s34, 2
-; SI-NEXT:    v_writelane_b32 v32, s35, 3
-; SI-NEXT:    v_writelane_b32 v32, s36, 4
-; SI-NEXT:    v_writelane_b32 v32, s37, 5
-; SI-NEXT:    v_writelane_b32 v32, s38, 6
-; SI-NEXT:    v_writelane_b32 v32, s39, 7
-; SI-NEXT:    v_writelane_b32 v32, s48, 8
-; SI-NEXT:    v_writelane_b32 v32, s49, 9
-; SI-NEXT:    v_writelane_b32 v32, s50, 10
-; SI-NEXT:    v_writelane_b32 v32, s51, 11
-; SI-NEXT:    v_writelane_b32 v32, s52, 12
-; SI-NEXT:    v_writelane_b32 v32, s53, 13
-; SI-NEXT:    v_writelane_b32 v32, s54, 14
-; SI-NEXT:    v_writelane_b32 v32, s55, 15
-; SI-NEXT:    v_writelane_b32 v32, s64, 16
-; SI-NEXT:    v_writelane_b32 v32, s65, 17
-; SI-NEXT:    v_writelane_b32 v32, s66, 18
-; SI-NEXT:    v_writelane_b32 v32, s67, 19
-; SI-NEXT:    v_writelane_b32 v32, s68, 20
-; SI-NEXT:    v_writelane_b32 v32, s69, 21
-; SI-NEXT:    v_writelane_b32 v32, s70, 22
-; SI-NEXT:    v_writelane_b32 v32, s71, 23
-; SI-NEXT:    v_writelane_b32 v32, s80, 24
-; SI-NEXT:    v_writelane_b32 v32, s81, 25
-; SI-NEXT:    v_writelane_b32 v32, s82, 26
+; SI-NEXT:    v_writelane_b32 v32, s34, 0
+; SI-NEXT:    v_writelane_b32 v32, s35, 1
+; SI-NEXT:    v_writelane_b32 v32, s36, 2
+; SI-NEXT:    v_writelane_b32 v32, s37, 3
+; SI-NEXT:    v_writelane_b32 v32, s38, 4
+; SI-NEXT:    v_writelane_b32 v32, s39, 5
+; SI-NEXT:    v_writelane_b32 v32, s48, 6
+; SI-NEXT:    v_writelane_b32 v32, s49, 7
+; SI-NEXT:    v_writelane_b32 v32, s50, 8
+; SI-NEXT:    v_writelane_b32 v32, s51, 9
+; SI-NEXT:    v_writelane_b32 v32, s52, 10
+; SI-NEXT:    v_writelane_b32 v32, s53, 11
+; SI-NEXT:    v_writelane_b32 v32, s54, 12
+; SI-NEXT:    v_writelane_b32 v32, s55, 13
+; SI-NEXT:    v_writelane_b32 v32, s64, 14
+; SI-NEXT:    v_writelane_b32 v32, s65, 15
+; SI-NEXT:    v_writelane_b32 v32, s66, 16
+; SI-NEXT:    v_writelane_b32 v32, s67, 17
+; SI-NEXT:    v_writelane_b32 v32, s68, 18
+; SI-NEXT:    v_writelane_b32 v32, s69, 19
+; SI-NEXT:    v_writelane_b32 v32, s70, 20
+; SI-NEXT:    v_writelane_b32 v32, s71, 21
+; SI-NEXT:    v_writelane_b32 v32, s80, 22
+; SI-NEXT:    v_writelane_b32 v32, s81, 23
+; SI-NEXT:    v_writelane_b32 v32, s82, 24
+; SI-NEXT:    v_writelane_b32 v32, s83, 25
+; SI-NEXT:    v_writelane_b32 v32, s30, 26
 ; SI-NEXT:    v_readfirstlane_b32 s6, v11
 ; SI-NEXT:    v_readfirstlane_b32 s8, v10
 ; SI-NEXT:    v_readfirstlane_b32 s10, v9
@@ -20831,7 +20831,7 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a,
 ; SI-NEXT:    v_readfirstlane_b32 s88, v2
 ; SI-NEXT:    v_readfirstlane_b32 s91, v1
 ; SI-NEXT:    v_readfirstlane_b32 s94, v0
-; SI-NEXT:    v_writelane_b32 v32, s83, 27
+; SI-NEXT:    v_writelane_b32 v32, s31, 27
 ; SI-NEXT:    s_lshr_b32 s90, s29, 16
 ; SI-NEXT:    s_lshr_b32 s93, s28, 16
 ; SI-NEXT:    s_lshr_b32 s30, s27, 16
@@ -21188,34 +21188,34 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a,
 ; SI-NEXT:    v_mov_b32_e32 v30, s66
 ; SI-NEXT:    v_mov_b32_e32 v31, s67
 ; SI-NEXT:  .LBB35_5: ; %end
-; SI-NEXT:    v_readlane_b32 s83, v32, 27
-; SI-NEXT:    v_readlane_b32 s82, v32, 26
-; SI-NEXT:    v_readlane_b32 s81, v32, 25
-; SI-NEXT:    v_readlane_b32 s80, v32, 24
-; SI-NEXT:    v_readlane_b32 s71, v32, 23
-; SI-NEXT:    v_readlane_b32 s70, v32, 22
-; SI-NEXT:    v_readlane_b32 s69, v32, 21
-; SI-NEXT:    v_readlane_b32 s68, v32, 20
-; SI-NEXT:    v_readlane_b32 s67, v32, 19
-; SI-NEXT:    v_readlane_b32 s66, v32, 18
-; SI-NEXT:    v_readlane_b32 s65, v32, 17
-; SI-NEXT:    v_readlane_b32 s64, v32, 16
-; SI-NEXT:    v_readlane_b32 s55, v32, 15
-; SI-NEXT:    v_readlane_b32 s54, v32, 14
-; SI-NEXT:    v_readlane_b32 s53, v32, 13
-; SI-NEXT:    v_readlane_b32 s52, v32, 12
-; SI-NEXT:    v_readlane_b32 s51, v32, 11
-; SI-NEXT:    v_readlane_b32 s50, v32, 10
-; SI-NEXT:    v_readlane_b32 s49, v32, 9
-; SI-NEXT:    v_readlane_b32 s48, v32, 8
-; SI-NEXT:    v_readlane_b32 s39, v32, 7
-; SI-NEXT:    v_readlane_b32 s38, v32, 6
-; SI-NEXT:    v_readlane_b32 s37, v32, 5
-; SI-NEXT:    v_readlane_b32 s36, v32, 4
-; SI-NEXT:    v_readlane_b32 s35, v32, 3
-; SI-NEXT:    v_readlane_b32 s34, v32, 2
-; SI-NEXT:    v_readlane_b32 s31, v32, 1
-; SI-NEXT:    v_readlane_b32 s30, v32, 0
+; SI-NEXT:    v_readlane_b32 s30, v32, 26
+; SI-NEXT:    v_readlane_b32 s31, v32, 27
+; SI-NEXT:    v_readlane_b32 s83, v32, 25
+; SI-NEXT:    v_readlane_b32 s82, v32, 24
+; SI-NEXT:    v_readlane_b32 s81, v32, 23
+; SI-NEXT:    v_readlane_b32 s80, v32, 22
+; SI-NEXT:    v_readlane_b32 s71, v32, 21
+; SI-NEXT:    v_readlane_b32 s70, v32, 20
+; SI-NEXT:    v_readlane_b32 s69, v32, 19
+; SI-NEXT:    v_readlane_b32 s68, v32, 18
+; SI-NEXT:    v_readlane_b32 s67, v32, 17
+; SI-NEXT:    v_readlane_b32 s66, v32, 16
+; SI-NEXT:    v_readlane_b32 s65, v32, 15
+; SI-NEXT:    v_readlane_b32 s64, v32, 14
+; SI-NEXT:    v_readlane_b32 s55, v32, 13
+; SI-NEXT:    v_readlane_b32 s54, v32, 12
+; SI-NEXT:    v_readlane_b32 s53, v32, 11
+; SI-NEXT:    v_readlane_b32 s52, v32, 10
+; SI-NEXT:    v_readlane_b32 s51, v32, 9
+; SI-NEXT:    v_readlane_b32 s50, v32, 8
+; SI-NEXT:    v_readlane_b32 s49, v32, 7
+; SI-NEXT:    v_readlane_b32 s48, v32, 6
+; SI-NEXT:    v_readlane_b32 s39, v32, 5
+; SI-NEXT:    v_readlane_b32 s38, v32, 4
+; SI-NEXT:    v_readlane_b32 s37, v32, 3
+; SI-NEXT:    v_readlane_b32 s36, v32, 2
+; SI-NEXT:    v_readlane_b32 s35, v32, 1
+; SI-NEXT:    v_readlane_b32 s34, v32, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -21228,37 +21228,38 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a,
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v32, s30, 0
-; VI-NEXT:    v_writelane_b32 v32, s31, 1
-; VI-NEXT:    v_writelane_b32 v32, s34, 2
-; VI-NEXT:    v_writelane_b32 v32, s35, 3
-; VI-NEXT:    v_writelane_b32 v32, s36, 4
-; VI-NEXT:    v_writelane_b32 v32, s37, 5
-; VI-NEXT:    v_writelane_b32 v32, s38, 6
-; VI-NEXT:    v_writelane_b32 v32, s39, 7
-; VI-NEXT:    v_writelane_b32 v32, s48, 8
-; VI-NEXT:    v_writelane_b32 v32, s49, 9
-; VI-NEXT:    v_writelane_b32 v32, s50, 10
-; VI-NEXT:    v_writelane_b32 v32, s51, 11
-; VI-NEXT:    v_writelane_b32 v32, s52, 12
-; VI-NEXT:    v_writelane_b32 v32, s53, 13
-; VI-NEXT:    v_writelane_b32 v32, s54, 14
-; VI-NEXT:    v_writelane_b32 v32, s55, 15
-; VI-NEXT:    v_writelane_b32 v32, s64, 16
-; VI-NEXT:    v_writelane_b32 v32, s65, 17
-; VI-NEXT:    v_writelane_b32 v32, s66, 18
-; VI-NEXT:    v_writelane_b32 v32, s67, 19
-; VI-NEXT:    v_writelane_b32 v32, s68, 20
-; VI-NEXT:    v_writelane_b32 v32, s69, 21
-; VI-NEXT:    v_writelane_b32 v32, s70, 22
-; VI-NEXT:    v_writelane_b32 v32, s71, 23
-; VI-NEXT:    v_writelane_b32 v32, s80, 24
-; VI-NEXT:    v_writelane_b32 v32, s81, 25
-; VI-NEXT:    v_writelane_b32 v32, s82, 26
-; VI-NEXT:    v_writelane_b32 v32, s83, 27
-; VI-NEXT:    v_writelane_b32 v32, s84, 28
-; VI-NEXT:    v_writelane_b32 v32, s85, 29
-; VI-NEXT:    v_writelane_b32 v32, s86, 30
+; VI-NEXT:    v_writelane_b32 v32, s34, 0
+; VI-NEXT:    v_writelane_b32 v32, s35, 1
+; VI-NEXT:    v_writelane_b32 v32, s36, 2
+; VI-NEXT:    v_writelane_b32 v32, s37, 3
+; VI-NEXT:    v_writelane_b32 v32, s38, 4
+; VI-NEXT:    v_writelane_b32 v32, s39, 5
+; VI-NEXT:    v_writelane_b32 v32, s48, 6
+; VI-NEXT:    v_writelane_b32 v32, s49, 7
+; VI-NEXT:    v_writelane_b32 v32, s50, 8
+; VI-NEXT:    v_writelane_b32 v32, s51, 9
+; VI-NEXT:    v_writelane_b32 v32, s52, 10
+; VI-NEXT:    v_writelane_b32 v32, s53, 11
+; VI-NEXT:    v_writelane_b32 v32, s54, 12
+; VI-NEXT:    v_writelane_b32 v32, s55, 13
+; VI-NEXT:    v_writelane_b32 v32, s64, 14
+; VI-NEXT:    v_writelane_b32 v32, s65, 15
+; VI-NEXT:    v_writelane_b32 v32, s66, 16
+; VI-NEXT:    v_writelane_b32 v32, s67, 17
+; VI-NEXT:    v_writelane_b32 v32, s68, 18
+; VI-NEXT:    v_writelane_b32 v32, s69, 19
+; VI-NEXT:    v_writelane_b32 v32, s70, 20
+; VI-NEXT:    v_writelane_b32 v32, s71, 21
+; VI-NEXT:    v_writelane_b32 v32, s80, 22
+; VI-NEXT:    v_writelane_b32 v32, s81, 23
+; VI-NEXT:    v_writelane_b32 v32, s82, 24
+; VI-NEXT:    v_writelane_b32 v32, s83, 25
+; VI-NEXT:    v_writelane_b32 v32, s84, 26
+; VI-NEXT:    v_writelane_b32 v32, s85, 27
+; VI-NEXT:    v_writelane_b32 v32, s86, 28
+; VI-NEXT:    v_writelane_b32 v32, s87, 29
+; VI-NEXT:    v_writelane_b32 v32, s30, 30
+; VI-NEXT:    v_writelane_b32 v32, s31, 31
 ; VI-NEXT:    v_readfirstlane_b32 s6, v11
 ; VI-NEXT:    v_readfirstlane_b32 s8, v10
 ; VI-NEXT:    v_readfirstlane_b32 s10, v9
@@ -21271,7 +21272,6 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a,
 ; VI-NEXT:    v_readfirstlane_b32 s30, v2
 ; VI-NEXT:    v_readfirstlane_b32 s35, v1
 ; VI-NEXT:    v_readfirstlane_b32 s71, v0
-; VI-NEXT:    v_writelane_b32 v32, s87, 31
 ; VI-NEXT:    s_lshr_b32 s74, s29, 16
 ; VI-NEXT:    s_lshr_b32 s77, s28, 16
 ; VI-NEXT:    s_lshr_b32 s88, s27, 16
@@ -21525,38 +21525,38 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a,
 ; VI-NEXT:    v_mov_b32_e32 v30, s66
 ; VI-NEXT:    v_mov_b32_e32 v31, s67
 ; VI-NEXT:  .LBB35_5: ; %end
-; VI-NEXT:    v_readlane_b32 s87, v32, 31
-; VI-NEXT:    v_readlane_b32 s86, v32, 30
-; VI-NEXT:    v_readlane_b32 s85, v32, 29
-; VI-NEXT:    v_readlane_b32 s84, v32, 28
-; VI-NEXT:    v_readlane_b32 s83, v32, 27
-; VI-NEXT:    v_readlane_b32 s82, v32, 26
-; VI-NEXT:    v_readlane_b32 s81, v32, 25
-; VI-NEXT:    v_readlane_b32 s80, v32, 24
-; VI-NEXT:    v_readlane_b32 s71, v32, 23
-; VI-NEXT:    v_readlane_b32 s70, v32, 22
-; VI-NEXT:    v_readlane_b32 s69, v32, 21
-; VI-NEXT:    v_readlane_b32 s68, v32, 20
-; VI-NEXT:    v_readlane_b32 s67, v32, 19
-; VI-NEXT:    v_readlane_b32 s66, v32, 18
-; VI-NEXT:    v_readlane_b32 s65, v32, 17
-; VI-NEXT:    v_readlane_b32 s64, v32, 16
-; VI-NEXT:    v_readlane_b32 s55, v32, 15
-; VI-NEXT:    v_readlane_b32 s54, v32, 14
-; VI-NEXT:    v_readlane_b32 s53, v32, 13
-; VI-NEXT:    v_readlane_b32 s52, v32, 12
-; VI-NEXT:    v_readlane_b32 s51, v32, 11
-; VI-NEXT:    v_readlane_b32 s50, v32, 10
-; VI-NEXT:    v_readlane_b32 s49, v32, 9
-; VI-NEXT:    v_readlane_b32 s48, v32, 8
-; VI-NEXT:    v_readlane_b32 s39, v32, 7
-; VI-NEXT:    v_readlane_b32 s38, v32, 6
-; VI-NEXT:    v_readlane_b32 s37, v32, 5
-; VI-NEXT:    v_readlane_b32 s36, v32, 4
-; VI-NEXT:    v_readlane_b32 s35, v32, 3
-; VI-NEXT:    v_readlane_b32 s34, v32, 2
-; VI-NEXT:    v_readlane_b32 s31, v32, 1
-; VI-NEXT:    v_readlane_b32 s30, v32, 0
+; VI-NEXT:    v_readlane_b32 s30, v32, 30
+; VI-NEXT:    v_readlane_b32 s31, v32, 31
+; VI-NEXT:    v_readlane_b32 s87, v32, 29
+; VI-NEXT:    v_readlane_b32 s86, v32, 28
+; VI-NEXT:    v_readlane_b32 s85, v32, 27
+; VI-NEXT:    v_readlane_b32 s84, v32, 26
+; VI-NEXT:    v_readlane_b32 s83, v32, 25
+; VI-NEXT:    v_readlane_b32 s82, v32, 24
+; VI-NEXT:    v_readlane_b32 s81, v32, 23
+; VI-NEXT:    v_readlane_b32 s80, v32, 22
+; VI-NEXT:    v_readlane_b32 s71, v32, 21
+; VI-NEXT:    v_readlane_b32 s70, v32, 20
+; VI-NEXT:    v_readlane_b32 s69, v32, 19
+; VI-NEXT:    v_readlane_b32 s68, v32, 18
+; VI-NEXT:    v_readlane_b32 s67, v32, 17
+; VI-NEXT:    v_readlane_b32 s66, v32, 16
+; VI-NEXT:    v_readlane_b32 s65, v32, 15
+; VI-NEXT:    v_readlane_b32 s64, v32, 14
+; VI-NEXT:    v_readlane_b32 s55, v32, 13
+; VI-NEXT:    v_readlane_b32 s54, v32, 12
+; VI-NEXT:    v_readlane_b32 s53, v32, 11
+; VI-NEXT:    v_readlane_b32 s52, v32, 10
+; VI-NEXT:    v_readlane_b32 s51, v32, 9
+; VI-NEXT:    v_readlane_b32 s50, v32, 8
+; VI-NEXT:    v_readlane_b32 s49, v32, 7
+; VI-NEXT:    v_readlane_b32 s48, v32, 6
+; VI-NEXT:    v_readlane_b32 s39, v32, 5
+; VI-NEXT:    v_readlane_b32 s38, v32, 4
+; VI-NEXT:    v_readlane_b32 s37, v32, 3
+; VI-NEXT:    v_readlane_b32 s36, v32, 2
+; VI-NEXT:    v_readlane_b32 s35, v32, 1
+; VI-NEXT:    v_readlane_b32 s34, v32, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -21868,7 +21868,7 @@ end:
   ret <26 x float> %phi
 }
 
-define <13 x double> @bitcast_v13i64_to_v13f64(<13 x i64> %a, i32 %b) {
+define <13 x double> @bitcast_v13i64_to_v13f64(<13 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v13i64_to_v13f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22049,7 +22049,7 @@ end:
   ret <13 x double> %phi
 }
 
-define inreg <13 x double> @bitcast_v13i64_to_v13f64_scalar(<13 x i64> inreg %a, i32 inreg %b) {
+define inreg <13 x double> @bitcast_v13i64_to_v13f64_scalar(<13 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v13i64_to_v13f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22363,7 +22363,7 @@ end:
   ret <13 x double> %phi
 }
 
-define <13 x i64> @bitcast_v13f64_to_v13i64(<13 x double> %a, i32 %b) {
+define <13 x i64> @bitcast_v13f64_to_v13i64(<13 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v13f64_to_v13i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22485,7 +22485,7 @@ end:
   ret <13 x i64> %phi
 }
 
-define inreg <13 x i64> @bitcast_v13f64_to_v13i64_scalar(<13 x double> inreg %a, i32 inreg %b) {
+define inreg <13 x i64> @bitcast_v13f64_to_v13i64_scalar(<13 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v13f64_to_v13i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22961,7 +22961,7 @@ end:
   ret <13 x i64> %phi
 }
 
-define <52 x i16> @bitcast_v13i64_to_v52i16(<13 x i64> %a, i32 %b) {
+define <52 x i16> @bitcast_v13i64_to_v52i16(<13 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v13i64_to_v52i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -23733,7 +23733,7 @@ end:
   ret <52 x i16> %phi
 }
 
-define inreg <52 x i16> @bitcast_v13i64_to_v52i16_scalar(<13 x i64> inreg %a, i32 inreg %b) {
+define inreg <52 x i16> @bitcast_v13i64_to_v52i16_scalar(<13 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v13i64_to_v52i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -23741,15 +23741,15 @@ define inreg <52 x i16> @bitcast_v13i64_to_v52i16_scalar(<13 x i64> inreg %a, i3
 ; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v26, s30, 0
-; SI-NEXT:    v_writelane_b32 v26, s31, 1
-; SI-NEXT:    v_writelane_b32 v26, s34, 2
-; SI-NEXT:    v_writelane_b32 v26, s35, 3
-; SI-NEXT:    v_writelane_b32 v26, s36, 4
-; SI-NEXT:    v_writelane_b32 v26, s37, 5
-; SI-NEXT:    v_writelane_b32 v26, s38, 6
+; SI-NEXT:    v_writelane_b32 v26, s34, 0
+; SI-NEXT:    v_writelane_b32 v26, s35, 1
+; SI-NEXT:    v_writelane_b32 v26, s36, 2
+; SI-NEXT:    v_writelane_b32 v26, s37, 3
+; SI-NEXT:    v_writelane_b32 v26, s38, 4
+; SI-NEXT:    v_writelane_b32 v26, s39, 5
+; SI-NEXT:    v_writelane_b32 v26, s48, 6
 ; SI-NEXT:    v_readfirstlane_b32 s14, v12
-; SI-NEXT:    v_writelane_b32 v26, s39, 7
+; SI-NEXT:    v_writelane_b32 v26, s30, 7
 ; SI-NEXT:    v_readfirstlane_b32 s5, v11
 ; SI-NEXT:    v_readfirstlane_b32 s4, v10
 ; SI-NEXT:    v_readfirstlane_b32 s7, v9
@@ -23763,7 +23763,7 @@ define inreg <52 x i16> @bitcast_v13i64_to_v52i16_scalar(<13 x i64> inreg %a, i3
 ; SI-NEXT:    v_readfirstlane_b32 s15, v1
 ; SI-NEXT:    s_cmp_lg_u32 s14, 0
 ; SI-NEXT:    v_readfirstlane_b32 s14, v0
-; SI-NEXT:    v_writelane_b32 v26, s48, 8
+; SI-NEXT:    v_writelane_b32 v26, s31, 8
 ; SI-NEXT:    s_cbranch_scc0 .LBB41_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s92, s5, 16
@@ -23925,6 +23925,7 @@ define inreg <52 x i16> @bitcast_v13i64_to_v52i16_scalar(<13 x i64> inreg %a, i3
 ; SI-NEXT:    s_lshl_b32 s40, s92, 16
 ; SI-NEXT:    s_or_b32 s7, s7, s41
 ; SI-NEXT:    s_or_b32 s5, s5, s40
+; SI-NEXT:    v_readlane_b32 s30, v26, 7
 ; SI-NEXT:    v_mov_b32_e32 v0, s16
 ; SI-NEXT:    v_mov_b32_e32 v1, s17
 ; SI-NEXT:    v_mov_b32_e32 v2, s18
@@ -23951,15 +23952,14 @@ define inreg <52 x i16> @bitcast_v13i64_to_v52i16_scalar(<13 x i64> inreg %a, i3
 ; SI-NEXT:    v_mov_b32_e32 v23, s7
 ; SI-NEXT:    v_mov_b32_e32 v24, s4
 ; SI-NEXT:    v_mov_b32_e32 v25, s5
-; SI-NEXT:    v_readlane_b32 s48, v26, 8
-; SI-NEXT:    v_readlane_b32 s39, v26, 7
-; SI-NEXT:    v_readlane_b32 s38, v26, 6
-; SI-NEXT:    v_readlane_b32 s37, v26, 5
-; SI-NEXT:    v_readlane_b32 s36, v26, 4
-; SI-NEXT:    v_readlane_b32 s35, v26, 3
-; SI-NEXT:    v_readlane_b32 s34, v26, 2
-; SI-NEXT:    v_readlane_b32 s31, v26, 1
-; SI-NEXT:    v_readlane_b32 s30, v26, 0
+; SI-NEXT:    v_readlane_b32 s31, v26, 8
+; SI-NEXT:    v_readlane_b32 s48, v26, 6
+; SI-NEXT:    v_readlane_b32 s39, v26, 5
+; SI-NEXT:    v_readlane_b32 s38, v26, 4
+; SI-NEXT:    v_readlane_b32 s37, v26, 3
+; SI-NEXT:    v_readlane_b32 s36, v26, 2
+; SI-NEXT:    v_readlane_b32 s35, v26, 1
+; SI-NEXT:    v_readlane_b32 s34, v26, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -24594,7 +24594,7 @@ end:
   ret <52 x i16> %phi
 }
 
-define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) {
+define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v52i16_to_v13i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -25743,7 +25743,7 @@ end:
   ret <13 x i64> %phi
 }
 
-define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i32 inreg %b) {
+define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v52i16_to_v13i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -25751,33 +25751,34 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3
 ; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v26, s30, 0
-; SI-NEXT:    v_writelane_b32 v26, s31, 1
-; SI-NEXT:    v_writelane_b32 v26, s34, 2
-; SI-NEXT:    v_writelane_b32 v26, s35, 3
-; SI-NEXT:    v_writelane_b32 v26, s36, 4
-; SI-NEXT:    v_writelane_b32 v26, s37, 5
-; SI-NEXT:    v_writelane_b32 v26, s38, 6
-; SI-NEXT:    v_writelane_b32 v26, s39, 7
-; SI-NEXT:    v_writelane_b32 v26, s48, 8
-; SI-NEXT:    v_writelane_b32 v26, s49, 9
-; SI-NEXT:    v_writelane_b32 v26, s50, 10
-; SI-NEXT:    v_writelane_b32 v26, s51, 11
-; SI-NEXT:    v_writelane_b32 v26, s52, 12
-; SI-NEXT:    v_writelane_b32 v26, s53, 13
-; SI-NEXT:    v_writelane_b32 v26, s54, 14
-; SI-NEXT:    v_writelane_b32 v26, s55, 15
-; SI-NEXT:    v_writelane_b32 v26, s64, 16
-; SI-NEXT:    v_writelane_b32 v26, s65, 17
-; SI-NEXT:    v_writelane_b32 v26, s66, 18
-; SI-NEXT:    v_writelane_b32 v26, s67, 19
-; SI-NEXT:    v_writelane_b32 v26, s68, 20
-; SI-NEXT:    v_writelane_b32 v26, s69, 21
-; SI-NEXT:    v_writelane_b32 v26, s70, 22
-; SI-NEXT:    v_writelane_b32 v26, s71, 23
-; SI-NEXT:    v_writelane_b32 v26, s80, 24
-; SI-NEXT:    v_writelane_b32 v26, s81, 25
-; SI-NEXT:    v_writelane_b32 v26, s82, 26
+; SI-NEXT:    v_writelane_b32 v26, s34, 0
+; SI-NEXT:    v_writelane_b32 v26, s35, 1
+; SI-NEXT:    v_writelane_b32 v26, s36, 2
+; SI-NEXT:    v_writelane_b32 v26, s37, 3
+; SI-NEXT:    v_writelane_b32 v26, s38, 4
+; SI-NEXT:    v_writelane_b32 v26, s39, 5
+; SI-NEXT:    v_writelane_b32 v26, s48, 6
+; SI-NEXT:    v_writelane_b32 v26, s49, 7
+; SI-NEXT:    v_writelane_b32 v26, s50, 8
+; SI-NEXT:    v_writelane_b32 v26, s51, 9
+; SI-NEXT:    v_writelane_b32 v26, s52, 10
+; SI-NEXT:    v_writelane_b32 v26, s53, 11
+; SI-NEXT:    v_writelane_b32 v26, s54, 12
+; SI-NEXT:    v_writelane_b32 v26, s55, 13
+; SI-NEXT:    v_writelane_b32 v26, s64, 14
+; SI-NEXT:    v_writelane_b32 v26, s65, 15
+; SI-NEXT:    v_writelane_b32 v26, s66, 16
+; SI-NEXT:    v_writelane_b32 v26, s67, 17
+; SI-NEXT:    v_writelane_b32 v26, s68, 18
+; SI-NEXT:    v_writelane_b32 v26, s69, 19
+; SI-NEXT:    v_writelane_b32 v26, s70, 20
+; SI-NEXT:    v_writelane_b32 v26, s71, 21
+; SI-NEXT:    v_writelane_b32 v26, s80, 22
+; SI-NEXT:    v_writelane_b32 v26, s81, 23
+; SI-NEXT:    v_writelane_b32 v26, s82, 24
+; SI-NEXT:    v_writelane_b32 v26, s83, 25
+; SI-NEXT:    v_writelane_b32 v26, s30, 26
+; SI-NEXT:    v_writelane_b32 v26, s31, 27
 ; SI-NEXT:    v_readfirstlane_b32 s7, v11
 ; SI-NEXT:    v_readfirstlane_b32 s9, v10
 ; SI-NEXT:    v_readfirstlane_b32 s11, v9
@@ -25790,7 +25791,6 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3
 ; SI-NEXT:    v_readfirstlane_b32 s93, v2
 ; SI-NEXT:    v_readfirstlane_b32 s30, v1
 ; SI-NEXT:    v_readfirstlane_b32 s35, v0
-; SI-NEXT:    v_writelane_b32 v26, s83, 27
 ; SI-NEXT:    s_lshr_b32 s76, s29, 16
 ; SI-NEXT:    s_lshr_b32 s79, s28, 16
 ; SI-NEXT:    s_lshr_b32 s89, s27, 16
@@ -26032,6 +26032,7 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_add_i32 s61, s4, 0x30000
 ; SI-NEXT:  .LBB43_3: ; %end
+; SI-NEXT:    v_readlane_b32 s30, v26, 26
 ; SI-NEXT:    v_mov_b32_e32 v0, s36
 ; SI-NEXT:    v_mov_b32_e32 v1, s37
 ; SI-NEXT:    v_mov_b32_e32 v2, s38
@@ -26058,34 +26059,33 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3
 ; SI-NEXT:    v_mov_b32_e32 v23, s59
 ; SI-NEXT:    v_mov_b32_e32 v24, s60
 ; SI-NEXT:    v_mov_b32_e32 v25, s61
-; SI-NEXT:    v_readlane_b32 s83, v26, 27
-; SI-NEXT:    v_readlane_b32 s82, v26, 26
-; SI-NEXT:    v_readlane_b32 s81, v26, 25
-; SI-NEXT:    v_readlane_b32 s80, v26, 24
-; SI-NEXT:    v_readlane_b32 s71, v26, 23
-; SI-NEXT:    v_readlane_b32 s70, v26, 22
-; SI-NEXT:    v_readlane_b32 s69, v26, 21
-; SI-NEXT:    v_readlane_b32 s68, v26, 20
-; SI-NEXT:    v_readlane_b32 s67, v26, 19
-; SI-NEXT:    v_readlane_b32 s66, v26, 18
-; SI-NEXT:    v_readlane_b32 s65, v26, 17
-; SI-NEXT:    v_readlane_b32 s64, v26, 16
-; SI-NEXT:    v_readlane_b32 s55, v26, 15
-; SI-NEXT:    v_readlane_b32 s54, v26, 14
-; SI-NEXT:    v_readlane_b32 s53, v26, 13
-; SI-NEXT:    v_readlane_b32 s52, v26, 12
-; SI-NEXT:    v_readlane_b32 s51, v26, 11
-; SI-NEXT:    v_readlane_b32 s50, v26, 10
-; SI-NEXT:    v_readlane_b32 s49, v26, 9
-; SI-NEXT:    v_readlane_b32 s48, v26, 8
-; SI-NEXT:    v_readlane_b32 s39, v26, 7
-; SI-NEXT:    v_readlane_b32 s38, v26, 6
-; SI-NEXT:    v_readlane_b32 s37, v26, 5
-; SI-NEXT:    v_readlane_b32 s36, v26, 4
-; SI-NEXT:    v_readlane_b32 s35, v26, 3
-; SI-NEXT:    v_readlane_b32 s34, v26, 2
-; SI-NEXT:    v_readlane_b32 s31, v26, 1
-; SI-NEXT:    v_readlane_b32 s30, v26, 0
+; SI-NEXT:    v_readlane_b32 s31, v26, 27
+; SI-NEXT:    v_readlane_b32 s83, v26, 25
+; SI-NEXT:    v_readlane_b32 s82, v26, 24
+; SI-NEXT:    v_readlane_b32 s81, v26, 23
+; SI-NEXT:    v_readlane_b32 s80, v26, 22
+; SI-NEXT:    v_readlane_b32 s71, v26, 21
+; SI-NEXT:    v_readlane_b32 s70, v26, 20
+; SI-NEXT:    v_readlane_b32 s69, v26, 19
+; SI-NEXT:    v_readlane_b32 s68, v26, 18
+; SI-NEXT:    v_readlane_b32 s67, v26, 17
+; SI-NEXT:    v_readlane_b32 s66, v26, 16
+; SI-NEXT:    v_readlane_b32 s65, v26, 15
+; SI-NEXT:    v_readlane_b32 s64, v26, 14
+; SI-NEXT:    v_readlane_b32 s55, v26, 13
+; SI-NEXT:    v_readlane_b32 s54, v26, 12
+; SI-NEXT:    v_readlane_b32 s53, v26, 11
+; SI-NEXT:    v_readlane_b32 s52, v26, 10
+; SI-NEXT:    v_readlane_b32 s51, v26, 9
+; SI-NEXT:    v_readlane_b32 s50, v26, 8
+; SI-NEXT:    v_readlane_b32 s49, v26, 7
+; SI-NEXT:    v_readlane_b32 s48, v26, 6
+; SI-NEXT:    v_readlane_b32 s39, v26, 5
+; SI-NEXT:    v_readlane_b32 s38, v26, 4
+; SI-NEXT:    v_readlane_b32 s37, v26, 3
+; SI-NEXT:    v_readlane_b32 s36, v26, 2
+; SI-NEXT:    v_readlane_b32 s35, v26, 1
+; SI-NEXT:    v_readlane_b32 s34, v26, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -26101,37 +26101,38 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v26, s30, 0
-; VI-NEXT:    v_writelane_b32 v26, s31, 1
-; VI-NEXT:    v_writelane_b32 v26, s34, 2
-; VI-NEXT:    v_writelane_b32 v26, s35, 3
-; VI-NEXT:    v_writelane_b32 v26, s36, 4
-; VI-NEXT:    v_writelane_b32 v26, s37, 5
-; VI-NEXT:    v_writelane_b32 v26, s38, 6
-; VI-NEXT:    v_writelane_b32 v26, s39, 7
-; VI-NEXT:    v_writelane_b32 v26, s48, 8
-; VI-NEXT:    v_writelane_b32 v26, s49, 9
-; VI-NEXT:    v_writelane_b32 v26, s50, 10
-; VI-NEXT:    v_writelane_b32 v26, s51, 11
-; VI-NEXT:    v_writelane_b32 v26, s52, 12
-; VI-NEXT:    v_writelane_b32 v26, s53, 13
-; VI-NEXT:    v_writelane_b32 v26, s54, 14
-; VI-NEXT:    v_writelane_b32 v26, s55, 15
-; VI-NEXT:    v_writelane_b32 v26, s64, 16
-; VI-NEXT:    v_writelane_b32 v26, s65, 17
-; VI-NEXT:    v_writelane_b32 v26, s66, 18
-; VI-NEXT:    v_writelane_b32 v26, s67, 19
-; VI-NEXT:    v_writelane_b32 v26, s68, 20
-; VI-NEXT:    v_writelane_b32 v26, s69, 21
-; VI-NEXT:    v_writelane_b32 v26, s70, 22
-; VI-NEXT:    v_writelane_b32 v26, s71, 23
-; VI-NEXT:    v_writelane_b32 v26, s80, 24
-; VI-NEXT:    v_writelane_b32 v26, s81, 25
-; VI-NEXT:    v_writelane_b32 v26, s82, 26
-; VI-NEXT:    v_writelane_b32 v26, s83, 27
-; VI-NEXT:    v_writelane_b32 v26, s84, 28
-; VI-NEXT:    v_writelane_b32 v26, s85, 29
-; VI-NEXT:    v_writelane_b32 v26, s86, 30
+; VI-NEXT:    v_writelane_b32 v26, s34, 0
+; VI-NEXT:    v_writelane_b32 v26, s35, 1
+; VI-NEXT:    v_writelane_b32 v26, s36, 2
+; VI-NEXT:    v_writelane_b32 v26, s37, 3
+; VI-NEXT:    v_writelane_b32 v26, s38, 4
+; VI-NEXT:    v_writelane_b32 v26, s39, 5
+; VI-NEXT:    v_writelane_b32 v26, s48, 6
+; VI-NEXT:    v_writelane_b32 v26, s49, 7
+; VI-NEXT:    v_writelane_b32 v26, s50, 8
+; VI-NEXT:    v_writelane_b32 v26, s51, 9
+; VI-NEXT:    v_writelane_b32 v26, s52, 10
+; VI-NEXT:    v_writelane_b32 v26, s53, 11
+; VI-NEXT:    v_writelane_b32 v26, s54, 12
+; VI-NEXT:    v_writelane_b32 v26, s55, 13
+; VI-NEXT:    v_writelane_b32 v26, s64, 14
+; VI-NEXT:    v_writelane_b32 v26, s65, 15
+; VI-NEXT:    v_writelane_b32 v26, s66, 16
+; VI-NEXT:    v_writelane_b32 v26, s67, 17
+; VI-NEXT:    v_writelane_b32 v26, s68, 18
+; VI-NEXT:    v_writelane_b32 v26, s69, 19
+; VI-NEXT:    v_writelane_b32 v26, s70, 20
+; VI-NEXT:    v_writelane_b32 v26, s71, 21
+; VI-NEXT:    v_writelane_b32 v26, s80, 22
+; VI-NEXT:    v_writelane_b32 v26, s81, 23
+; VI-NEXT:    v_writelane_b32 v26, s82, 24
+; VI-NEXT:    v_writelane_b32 v26, s83, 25
+; VI-NEXT:    v_writelane_b32 v26, s84, 26
+; VI-NEXT:    v_writelane_b32 v26, s85, 27
+; VI-NEXT:    v_writelane_b32 v26, s86, 28
+; VI-NEXT:    v_writelane_b32 v26, s87, 29
+; VI-NEXT:    v_writelane_b32 v26, s30, 30
+; VI-NEXT:    v_writelane_b32 v26, s31, 31
 ; VI-NEXT:    v_readfirstlane_b32 s7, v11
 ; VI-NEXT:    v_readfirstlane_b32 s9, v10
 ; VI-NEXT:    v_readfirstlane_b32 s11, v9
@@ -26144,7 +26145,6 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3
 ; VI-NEXT:    v_readfirstlane_b32 s31, v2
 ; VI-NEXT:    v_readfirstlane_b32 s68, v1
 ; VI-NEXT:    v_readfirstlane_b32 s71, v0
-; VI-NEXT:    v_writelane_b32 v26, s87, 31
 ; VI-NEXT:    s_lshr_b32 s76, s29, 16
 ; VI-NEXT:    s_lshr_b32 s79, s28, 16
 ; VI-NEXT:    s_lshr_b32 s89, s27, 16
@@ -26386,6 +26386,7 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3
 ; VI-NEXT:    s_or_b32 s4, s5, s4
 ; VI-NEXT:    s_add_i32 s61, s4, 0x30000
 ; VI-NEXT:  .LBB43_3: ; %end
+; VI-NEXT:    v_readlane_b32 s30, v26, 30
 ; VI-NEXT:    v_mov_b32_e32 v0, s36
 ; VI-NEXT:    v_mov_b32_e32 v1, s37
 ; VI-NEXT:    v_mov_b32_e32 v2, s38
@@ -26412,38 +26413,37 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3
 ; VI-NEXT:    v_mov_b32_e32 v23, s59
 ; VI-NEXT:    v_mov_b32_e32 v24, s60
 ; VI-NEXT:    v_mov_b32_e32 v25, s61
-; VI-NEXT:    v_readlane_b32 s87, v26, 31
-; VI-NEXT:    v_readlane_b32 s86, v26, 30
-; VI-NEXT:    v_readlane_b32 s85, v26, 29
-; VI-NEXT:    v_readlane_b32 s84, v26, 28
-; VI-NEXT:    v_readlane_b32 s83, v26, 27
-; VI-NEXT:    v_readlane_b32 s82, v26, 26
-; VI-NEXT:    v_readlane_b32 s81, v26, 25
-; VI-NEXT:    v_readlane_b32 s80, v26, 24
-; VI-NEXT:    v_readlane_b32 s71, v26, 23
-; VI-NEXT:    v_readlane_b32 s70, v26, 22
-; VI-NEXT:    v_readlane_b32 s69, v26, 21
-; VI-NEXT:    v_readlane_b32 s68, v26, 20
-; VI-NEXT:    v_readlane_b32 s67, v26, 19
-; VI-NEXT:    v_readlane_b32 s66, v26, 18
-; VI-NEXT:    v_readlane_b32 s65, v26, 17
-; VI-NEXT:    v_readlane_b32 s64, v26, 16
-; VI-NEXT:    v_readlane_b32 s55, v26, 15
-; VI-NEXT:    v_readlane_b32 s54, v26, 14
-; VI-NEXT:    v_readlane_b32 s53, v26, 13
-; VI-NEXT:    v_readlane_b32 s52, v26, 12
-; VI-NEXT:    v_readlane_b32 s51, v26, 11
-; VI-NEXT:    v_readlane_b32 s50, v26, 10
-; VI-NEXT:    v_readlane_b32 s49, v26, 9
-; VI-NEXT:    v_readlane_b32 s48, v26, 8
-; VI-NEXT:    v_readlane_b32 s39, v26, 7
-; VI-NEXT:    v_readlane_b32 s38, v26, 6
-; VI-NEXT:    v_readlane_b32 s37, v26, 5
-; VI-NEXT:    v_readlane_b32 s36, v26, 4
-; VI-NEXT:    v_readlane_b32 s35, v26, 3
-; VI-NEXT:    v_readlane_b32 s34, v26, 2
-; VI-NEXT:    v_readlane_b32 s31, v26, 1
-; VI-NEXT:    v_readlane_b32 s30, v26, 0
+; VI-NEXT:    v_readlane_b32 s31, v26, 31
+; VI-NEXT:    v_readlane_b32 s87, v26, 29
+; VI-NEXT:    v_readlane_b32 s86, v26, 28
+; VI-NEXT:    v_readlane_b32 s85, v26, 27
+; VI-NEXT:    v_readlane_b32 s84, v26, 26
+; VI-NEXT:    v_readlane_b32 s83, v26, 25
+; VI-NEXT:    v_readlane_b32 s82, v26, 24
+; VI-NEXT:    v_readlane_b32 s81, v26, 23
+; VI-NEXT:    v_readlane_b32 s80, v26, 22
+; VI-NEXT:    v_readlane_b32 s71, v26, 21
+; VI-NEXT:    v_readlane_b32 s70, v26, 20
+; VI-NEXT:    v_readlane_b32 s69, v26, 19
+; VI-NEXT:    v_readlane_b32 s68, v26, 18
+; VI-NEXT:    v_readlane_b32 s67, v26, 17
+; VI-NEXT:    v_readlane_b32 s66, v26, 16
+; VI-NEXT:    v_readlane_b32 s65, v26, 15
+; VI-NEXT:    v_readlane_b32 s64, v26, 14
+; VI-NEXT:    v_readlane_b32 s55, v26, 13
+; VI-NEXT:    v_readlane_b32 s54, v26, 12
+; VI-NEXT:    v_readlane_b32 s53, v26, 11
+; VI-NEXT:    v_readlane_b32 s52, v26, 10
+; VI-NEXT:    v_readlane_b32 s51, v26, 9
+; VI-NEXT:    v_readlane_b32 s50, v26, 8
+; VI-NEXT:    v_readlane_b32 s49, v26, 7
+; VI-NEXT:    v_readlane_b32 s48, v26, 6
+; VI-NEXT:    v_readlane_b32 s39, v26, 5
+; VI-NEXT:    v_readlane_b32 s38, v26, 4
+; VI-NEXT:    v_readlane_b32 s37, v26, 3
+; VI-NEXT:    v_readlane_b32 s36, v26, 2
+; VI-NEXT:    v_readlane_b32 s35, v26, 1
+; VI-NEXT:    v_readlane_b32 s34, v26, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -26757,7 +26757,7 @@ end:
   ret <13 x i64> %phi
 }
 
-define <52 x half> @bitcast_v13i64_to_v52f16(<13 x i64> %a, i32 %b) {
+define <52 x half> @bitcast_v13i64_to_v52f16(<13 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v13i64_to_v52f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -27529,7 +27529,7 @@ end:
   ret <52 x half> %phi
 }
 
-define inreg <52 x half> @bitcast_v13i64_to_v52f16_scalar(<13 x i64> inreg %a, i32 inreg %b) {
+define inreg <52 x half> @bitcast_v13i64_to_v52f16_scalar(<13 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v13i64_to_v52f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -27537,15 +27537,15 @@ define inreg <52 x half> @bitcast_v13i64_to_v52f16_scalar(<13 x i64> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v26, s30, 0
-; SI-NEXT:    v_writelane_b32 v26, s31, 1
-; SI-NEXT:    v_writelane_b32 v26, s34, 2
-; SI-NEXT:    v_writelane_b32 v26, s35, 3
-; SI-NEXT:    v_writelane_b32 v26, s36, 4
-; SI-NEXT:    v_writelane_b32 v26, s37, 5
-; SI-NEXT:    v_writelane_b32 v26, s38, 6
+; SI-NEXT:    v_writelane_b32 v26, s34, 0
+; SI-NEXT:    v_writelane_b32 v26, s35, 1
+; SI-NEXT:    v_writelane_b32 v26, s36, 2
+; SI-NEXT:    v_writelane_b32 v26, s37, 3
+; SI-NEXT:    v_writelane_b32 v26, s38, 4
+; SI-NEXT:    v_writelane_b32 v26, s39, 5
+; SI-NEXT:    v_writelane_b32 v26, s48, 6
 ; SI-NEXT:    v_readfirstlane_b32 s14, v12
-; SI-NEXT:    v_writelane_b32 v26, s39, 7
+; SI-NEXT:    v_writelane_b32 v26, s30, 7
 ; SI-NEXT:    v_readfirstlane_b32 s5, v11
 ; SI-NEXT:    v_readfirstlane_b32 s4, v10
 ; SI-NEXT:    v_readfirstlane_b32 s7, v9
@@ -27559,7 +27559,7 @@ define inreg <52 x half> @bitcast_v13i64_to_v52f16_scalar(<13 x i64> inreg %a, i
 ; SI-NEXT:    v_readfirstlane_b32 s15, v1
 ; SI-NEXT:    s_cmp_lg_u32 s14, 0
 ; SI-NEXT:    v_readfirstlane_b32 s14, v0
-; SI-NEXT:    v_writelane_b32 v26, s48, 8
+; SI-NEXT:    v_writelane_b32 v26, s31, 8
 ; SI-NEXT:    s_cbranch_scc0 .LBB45_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s92, s5, 16
@@ -27721,6 +27721,7 @@ define inreg <52 x half> @bitcast_v13i64_to_v52f16_scalar(<13 x i64> inreg %a, i
 ; SI-NEXT:    s_lshl_b32 s40, s92, 16
 ; SI-NEXT:    s_or_b32 s7, s7, s41
 ; SI-NEXT:    s_or_b32 s5, s5, s40
+; SI-NEXT:    v_readlane_b32 s30, v26, 7
 ; SI-NEXT:    v_mov_b32_e32 v0, s16
 ; SI-NEXT:    v_mov_b32_e32 v1, s17
 ; SI-NEXT:    v_mov_b32_e32 v2, s18
@@ -27747,15 +27748,14 @@ define inreg <52 x half> @bitcast_v13i64_to_v52f16_scalar(<13 x i64> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v23, s7
 ; SI-NEXT:    v_mov_b32_e32 v24, s4
 ; SI-NEXT:    v_mov_b32_e32 v25, s5
-; SI-NEXT:    v_readlane_b32 s48, v26, 8
-; SI-NEXT:    v_readlane_b32 s39, v26, 7
-; SI-NEXT:    v_readlane_b32 s38, v26, 6
-; SI-NEXT:    v_readlane_b32 s37, v26, 5
-; SI-NEXT:    v_readlane_b32 s36, v26, 4
-; SI-NEXT:    v_readlane_b32 s35, v26, 3
-; SI-NEXT:    v_readlane_b32 s34, v26, 2
-; SI-NEXT:    v_readlane_b32 s31, v26, 1
-; SI-NEXT:    v_readlane_b32 s30, v26, 0
+; SI-NEXT:    v_readlane_b32 s31, v26, 8
+; SI-NEXT:    v_readlane_b32 s48, v26, 6
+; SI-NEXT:    v_readlane_b32 s39, v26, 5
+; SI-NEXT:    v_readlane_b32 s38, v26, 4
+; SI-NEXT:    v_readlane_b32 s37, v26, 3
+; SI-NEXT:    v_readlane_b32 s36, v26, 2
+; SI-NEXT:    v_readlane_b32 s35, v26, 1
+; SI-NEXT:    v_readlane_b32 s34, v26, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -28390,7 +28390,7 @@ end:
   ret <52 x half> %phi
 }
 
-define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) {
+define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v52f16_to_v13i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -29660,7 +29660,7 @@ end:
   ret <13 x i64> %phi
 }
 
-define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i32 inreg %b) {
+define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v52f16_to_v13i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -29668,33 +29668,33 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v32, s30, 0
-; SI-NEXT:    v_writelane_b32 v32, s31, 1
-; SI-NEXT:    v_writelane_b32 v32, s34, 2
-; SI-NEXT:    v_writelane_b32 v32, s35, 3
-; SI-NEXT:    v_writelane_b32 v32, s36, 4
-; SI-NEXT:    v_writelane_b32 v32, s37, 5
-; SI-NEXT:    v_writelane_b32 v32, s38, 6
-; SI-NEXT:    v_writelane_b32 v32, s39, 7
-; SI-NEXT:    v_writelane_b32 v32, s48, 8
-; SI-NEXT:    v_writelane_b32 v32, s49, 9
-; SI-NEXT:    v_writelane_b32 v32, s50, 10
-; SI-NEXT:    v_writelane_b32 v32, s51, 11
-; SI-NEXT:    v_writelane_b32 v32, s52, 12
-; SI-NEXT:    v_writelane_b32 v32, s53, 13
-; SI-NEXT:    v_writelane_b32 v32, s54, 14
-; SI-NEXT:    v_writelane_b32 v32, s55, 15
-; SI-NEXT:    v_writelane_b32 v32, s64, 16
-; SI-NEXT:    v_writelane_b32 v32, s65, 17
-; SI-NEXT:    v_writelane_b32 v32, s66, 18
-; SI-NEXT:    v_writelane_b32 v32, s67, 19
-; SI-NEXT:    v_writelane_b32 v32, s68, 20
-; SI-NEXT:    v_writelane_b32 v32, s69, 21
-; SI-NEXT:    v_writelane_b32 v32, s70, 22
-; SI-NEXT:    v_writelane_b32 v32, s71, 23
-; SI-NEXT:    v_writelane_b32 v32, s80, 24
-; SI-NEXT:    v_writelane_b32 v32, s81, 25
-; SI-NEXT:    v_writelane_b32 v32, s82, 26
+; SI-NEXT:    v_writelane_b32 v32, s34, 0
+; SI-NEXT:    v_writelane_b32 v32, s35, 1
+; SI-NEXT:    v_writelane_b32 v32, s36, 2
+; SI-NEXT:    v_writelane_b32 v32, s37, 3
+; SI-NEXT:    v_writelane_b32 v32, s38, 4
+; SI-NEXT:    v_writelane_b32 v32, s39, 5
+; SI-NEXT:    v_writelane_b32 v32, s48, 6
+; SI-NEXT:    v_writelane_b32 v32, s49, 7
+; SI-NEXT:    v_writelane_b32 v32, s50, 8
+; SI-NEXT:    v_writelane_b32 v32, s51, 9
+; SI-NEXT:    v_writelane_b32 v32, s52, 10
+; SI-NEXT:    v_writelane_b32 v32, s53, 11
+; SI-NEXT:    v_writelane_b32 v32, s54, 12
+; SI-NEXT:    v_writelane_b32 v32, s55, 13
+; SI-NEXT:    v_writelane_b32 v32, s64, 14
+; SI-NEXT:    v_writelane_b32 v32, s65, 15
+; SI-NEXT:    v_writelane_b32 v32, s66, 16
+; SI-NEXT:    v_writelane_b32 v32, s67, 17
+; SI-NEXT:    v_writelane_b32 v32, s68, 18
+; SI-NEXT:    v_writelane_b32 v32, s69, 19
+; SI-NEXT:    v_writelane_b32 v32, s70, 20
+; SI-NEXT:    v_writelane_b32 v32, s71, 21
+; SI-NEXT:    v_writelane_b32 v32, s80, 22
+; SI-NEXT:    v_writelane_b32 v32, s81, 23
+; SI-NEXT:    v_writelane_b32 v32, s82, 24
+; SI-NEXT:    v_writelane_b32 v32, s83, 25
+; SI-NEXT:    v_writelane_b32 v32, s30, 26
 ; SI-NEXT:    v_readfirstlane_b32 s6, v11
 ; SI-NEXT:    v_readfirstlane_b32 s8, v10
 ; SI-NEXT:    v_readfirstlane_b32 s10, v9
@@ -29707,7 +29707,7 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i
 ; SI-NEXT:    v_readfirstlane_b32 s88, v2
 ; SI-NEXT:    v_readfirstlane_b32 s91, v1
 ; SI-NEXT:    v_readfirstlane_b32 s94, v0
-; SI-NEXT:    v_writelane_b32 v32, s83, 27
+; SI-NEXT:    v_writelane_b32 v32, s31, 27
 ; SI-NEXT:    s_lshr_b32 s90, s29, 16
 ; SI-NEXT:    s_lshr_b32 s93, s28, 16
 ; SI-NEXT:    s_lshr_b32 s30, s27, 16
@@ -30064,34 +30064,34 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v30, s66
 ; SI-NEXT:    v_mov_b32_e32 v31, s67
 ; SI-NEXT:  .LBB47_5: ; %end
-; SI-NEXT:    v_readlane_b32 s83, v32, 27
-; SI-NEXT:    v_readlane_b32 s82, v32, 26
-; SI-NEXT:    v_readlane_b32 s81, v32, 25
-; SI-NEXT:    v_readlane_b32 s80, v32, 24
-; SI-NEXT:    v_readlane_b32 s71, v32, 23
-; SI-NEXT:    v_readlane_b32 s70, v32, 22
-; SI-NEXT:    v_readlane_b32 s69, v32, 21
-; SI-NEXT:    v_readlane_b32 s68, v32, 20
-; SI-NEXT:    v_readlane_b32 s67, v32, 19
-; SI-NEXT:    v_readlane_b32 s66, v32, 18
-; SI-NEXT:    v_readlane_b32 s65, v32, 17
-; SI-NEXT:    v_readlane_b32 s64, v32, 16
-; SI-NEXT:    v_readlane_b32 s55, v32, 15
-; SI-NEXT:    v_readlane_b32 s54, v32, 14
-; SI-NEXT:    v_readlane_b32 s53, v32, 13
-; SI-NEXT:    v_readlane_b32 s52, v32, 12
-; SI-NEXT:    v_readlane_b32 s51, v32, 11
-; SI-NEXT:    v_readlane_b32 s50, v32, 10
-; SI-NEXT:    v_readlane_b32 s49, v32, 9
-; SI-NEXT:    v_readlane_b32 s48, v32, 8
-; SI-NEXT:    v_readlane_b32 s39, v32, 7
-; SI-NEXT:    v_readlane_b32 s38, v32, 6
-; SI-NEXT:    v_readlane_b32 s37, v32, 5
-; SI-NEXT:    v_readlane_b32 s36, v32, 4
-; SI-NEXT:    v_readlane_b32 s35, v32, 3
-; SI-NEXT:    v_readlane_b32 s34, v32, 2
-; SI-NEXT:    v_readlane_b32 s31, v32, 1
-; SI-NEXT:    v_readlane_b32 s30, v32, 0
+; SI-NEXT:    v_readlane_b32 s30, v32, 26
+; SI-NEXT:    v_readlane_b32 s31, v32, 27
+; SI-NEXT:    v_readlane_b32 s83, v32, 25
+; SI-NEXT:    v_readlane_b32 s82, v32, 24
+; SI-NEXT:    v_readlane_b32 s81, v32, 23
+; SI-NEXT:    v_readlane_b32 s80, v32, 22
+; SI-NEXT:    v_readlane_b32 s71, v32, 21
+; SI-NEXT:    v_readlane_b32 s70, v32, 20
+; SI-NEXT:    v_readlane_b32 s69, v32, 19
+; SI-NEXT:    v_readlane_b32 s68, v32, 18
+; SI-NEXT:    v_readlane_b32 s67, v32, 17
+; SI-NEXT:    v_readlane_b32 s66, v32, 16
+; SI-NEXT:    v_readlane_b32 s65, v32, 15
+; SI-NEXT:    v_readlane_b32 s64, v32, 14
+; SI-NEXT:    v_readlane_b32 s55, v32, 13
+; SI-NEXT:    v_readlane_b32 s54, v32, 12
+; SI-NEXT:    v_readlane_b32 s53, v32, 11
+; SI-NEXT:    v_readlane_b32 s52, v32, 10
+; SI-NEXT:    v_readlane_b32 s51, v32, 9
+; SI-NEXT:    v_readlane_b32 s50, v32, 8
+; SI-NEXT:    v_readlane_b32 s49, v32, 7
+; SI-NEXT:    v_readlane_b32 s48, v32, 6
+; SI-NEXT:    v_readlane_b32 s39, v32, 5
+; SI-NEXT:    v_readlane_b32 s38, v32, 4
+; SI-NEXT:    v_readlane_b32 s37, v32, 3
+; SI-NEXT:    v_readlane_b32 s36, v32, 2
+; SI-NEXT:    v_readlane_b32 s35, v32, 1
+; SI-NEXT:    v_readlane_b32 s34, v32, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -30104,37 +30104,38 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v32, s30, 0
-; VI-NEXT:    v_writelane_b32 v32, s31, 1
-; VI-NEXT:    v_writelane_b32 v32, s34, 2
-; VI-NEXT:    v_writelane_b32 v32, s35, 3
-; VI-NEXT:    v_writelane_b32 v32, s36, 4
-; VI-NEXT:    v_writelane_b32 v32, s37, 5
-; VI-NEXT:    v_writelane_b32 v32, s38, 6
-; VI-NEXT:    v_writelane_b32 v32, s39, 7
-; VI-NEXT:    v_writelane_b32 v32, s48, 8
-; VI-NEXT:    v_writelane_b32 v32, s49, 9
-; VI-NEXT:    v_writelane_b32 v32, s50, 10
-; VI-NEXT:    v_writelane_b32 v32, s51, 11
-; VI-NEXT:    v_writelane_b32 v32, s52, 12
-; VI-NEXT:    v_writelane_b32 v32, s53, 13
-; VI-NEXT:    v_writelane_b32 v32, s54, 14
-; VI-NEXT:    v_writelane_b32 v32, s55, 15
-; VI-NEXT:    v_writelane_b32 v32, s64, 16
-; VI-NEXT:    v_writelane_b32 v32, s65, 17
-; VI-NEXT:    v_writelane_b32 v32, s66, 18
-; VI-NEXT:    v_writelane_b32 v32, s67, 19
-; VI-NEXT:    v_writelane_b32 v32, s68, 20
-; VI-NEXT:    v_writelane_b32 v32, s69, 21
-; VI-NEXT:    v_writelane_b32 v32, s70, 22
-; VI-NEXT:    v_writelane_b32 v32, s71, 23
-; VI-NEXT:    v_writelane_b32 v32, s80, 24
-; VI-NEXT:    v_writelane_b32 v32, s81, 25
-; VI-NEXT:    v_writelane_b32 v32, s82, 26
-; VI-NEXT:    v_writelane_b32 v32, s83, 27
-; VI-NEXT:    v_writelane_b32 v32, s84, 28
-; VI-NEXT:    v_writelane_b32 v32, s85, 29
-; VI-NEXT:    v_writelane_b32 v32, s86, 30
+; VI-NEXT:    v_writelane_b32 v32, s34, 0
+; VI-NEXT:    v_writelane_b32 v32, s35, 1
+; VI-NEXT:    v_writelane_b32 v32, s36, 2
+; VI-NEXT:    v_writelane_b32 v32, s37, 3
+; VI-NEXT:    v_writelane_b32 v32, s38, 4
+; VI-NEXT:    v_writelane_b32 v32, s39, 5
+; VI-NEXT:    v_writelane_b32 v32, s48, 6
+; VI-NEXT:    v_writelane_b32 v32, s49, 7
+; VI-NEXT:    v_writelane_b32 v32, s50, 8
+; VI-NEXT:    v_writelane_b32 v32, s51, 9
+; VI-NEXT:    v_writelane_b32 v32, s52, 10
+; VI-NEXT:    v_writelane_b32 v32, s53, 11
+; VI-NEXT:    v_writelane_b32 v32, s54, 12
+; VI-NEXT:    v_writelane_b32 v32, s55, 13
+; VI-NEXT:    v_writelane_b32 v32, s64, 14
+; VI-NEXT:    v_writelane_b32 v32, s65, 15
+; VI-NEXT:    v_writelane_b32 v32, s66, 16
+; VI-NEXT:    v_writelane_b32 v32, s67, 17
+; VI-NEXT:    v_writelane_b32 v32, s68, 18
+; VI-NEXT:    v_writelane_b32 v32, s69, 19
+; VI-NEXT:    v_writelane_b32 v32, s70, 20
+; VI-NEXT:    v_writelane_b32 v32, s71, 21
+; VI-NEXT:    v_writelane_b32 v32, s80, 22
+; VI-NEXT:    v_writelane_b32 v32, s81, 23
+; VI-NEXT:    v_writelane_b32 v32, s82, 24
+; VI-NEXT:    v_writelane_b32 v32, s83, 25
+; VI-NEXT:    v_writelane_b32 v32, s84, 26
+; VI-NEXT:    v_writelane_b32 v32, s85, 27
+; VI-NEXT:    v_writelane_b32 v32, s86, 28
+; VI-NEXT:    v_writelane_b32 v32, s87, 29
+; VI-NEXT:    v_writelane_b32 v32, s30, 30
+; VI-NEXT:    v_writelane_b32 v32, s31, 31
 ; VI-NEXT:    v_readfirstlane_b32 s6, v11
 ; VI-NEXT:    v_readfirstlane_b32 s8, v10
 ; VI-NEXT:    v_readfirstlane_b32 s10, v9
@@ -30147,7 +30148,6 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i
 ; VI-NEXT:    v_readfirstlane_b32 s30, v2
 ; VI-NEXT:    v_readfirstlane_b32 s35, v1
 ; VI-NEXT:    v_readfirstlane_b32 s71, v0
-; VI-NEXT:    v_writelane_b32 v32, s87, 31
 ; VI-NEXT:    s_lshr_b32 s74, s29, 16
 ; VI-NEXT:    s_lshr_b32 s77, s28, 16
 ; VI-NEXT:    s_lshr_b32 s88, s27, 16
@@ -30401,38 +30401,38 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i
 ; VI-NEXT:    v_mov_b32_e32 v30, s66
 ; VI-NEXT:    v_mov_b32_e32 v31, s67
 ; VI-NEXT:  .LBB47_5: ; %end
-; VI-NEXT:    v_readlane_b32 s87, v32, 31
-; VI-NEXT:    v_readlane_b32 s86, v32, 30
-; VI-NEXT:    v_readlane_b32 s85, v32, 29
-; VI-NEXT:    v_readlane_b32 s84, v32, 28
-; VI-NEXT:    v_readlane_b32 s83, v32, 27
-; VI-NEXT:    v_readlane_b32 s82, v32, 26
-; VI-NEXT:    v_readlane_b32 s81, v32, 25
-; VI-NEXT:    v_readlane_b32 s80, v32, 24
-; VI-NEXT:    v_readlane_b32 s71, v32, 23
-; VI-NEXT:    v_readlane_b32 s70, v32, 22
-; VI-NEXT:    v_readlane_b32 s69, v32, 21
-; VI-NEXT:    v_readlane_b32 s68, v32, 20
-; VI-NEXT:    v_readlane_b32 s67, v32, 19
-; VI-NEXT:    v_readlane_b32 s66, v32, 18
-; VI-NEXT:    v_readlane_b32 s65, v32, 17
-; VI-NEXT:    v_readlane_b32 s64, v32, 16
-; VI-NEXT:    v_readlane_b32 s55, v32, 15
-; VI-NEXT:    v_readlane_b32 s54, v32, 14
-; VI-NEXT:    v_readlane_b32 s53, v32, 13
-; VI-NEXT:    v_readlane_b32 s52, v32, 12
-; VI-NEXT:    v_readlane_b32 s51, v32, 11
-; VI-NEXT:    v_readlane_b32 s50, v32, 10
-; VI-NEXT:    v_readlane_b32 s49, v32, 9
-; VI-NEXT:    v_readlane_b32 s48, v32, 8
-; VI-NEXT:    v_readlane_b32 s39, v32, 7
-; VI-NEXT:    v_readlane_b32 s38, v32, 6
-; VI-NEXT:    v_readlane_b32 s37, v32, 5
-; VI-NEXT:    v_readlane_b32 s36, v32, 4
-; VI-NEXT:    v_readlane_b32 s35, v32, 3
-; VI-NEXT:    v_readlane_b32 s34, v32, 2
-; VI-NEXT:    v_readlane_b32 s31, v32, 1
-; VI-NEXT:    v_readlane_b32 s30, v32, 0
+; VI-NEXT:    v_readlane_b32 s30, v32, 30
+; VI-NEXT:    v_readlane_b32 s31, v32, 31
+; VI-NEXT:    v_readlane_b32 s87, v32, 29
+; VI-NEXT:    v_readlane_b32 s86, v32, 28
+; VI-NEXT:    v_readlane_b32 s85, v32, 27
+; VI-NEXT:    v_readlane_b32 s84, v32, 26
+; VI-NEXT:    v_readlane_b32 s83, v32, 25
+; VI-NEXT:    v_readlane_b32 s82, v32, 24
+; VI-NEXT:    v_readlane_b32 s81, v32, 23
+; VI-NEXT:    v_readlane_b32 s80, v32, 22
+; VI-NEXT:    v_readlane_b32 s71, v32, 21
+; VI-NEXT:    v_readlane_b32 s70, v32, 20
+; VI-NEXT:    v_readlane_b32 s69, v32, 19
+; VI-NEXT:    v_readlane_b32 s68, v32, 18
+; VI-NEXT:    v_readlane_b32 s67, v32, 17
+; VI-NEXT:    v_readlane_b32 s66, v32, 16
+; VI-NEXT:    v_readlane_b32 s65, v32, 15
+; VI-NEXT:    v_readlane_b32 s64, v32, 14
+; VI-NEXT:    v_readlane_b32 s55, v32, 13
+; VI-NEXT:    v_readlane_b32 s54, v32, 12
+; VI-NEXT:    v_readlane_b32 s53, v32, 11
+; VI-NEXT:    v_readlane_b32 s52, v32, 10
+; VI-NEXT:    v_readlane_b32 s51, v32, 9
+; VI-NEXT:    v_readlane_b32 s50, v32, 8
+; VI-NEXT:    v_readlane_b32 s49, v32, 7
+; VI-NEXT:    v_readlane_b32 s48, v32, 6
+; VI-NEXT:    v_readlane_b32 s39, v32, 5
+; VI-NEXT:    v_readlane_b32 s38, v32, 4
+; VI-NEXT:    v_readlane_b32 s37, v32, 3
+; VI-NEXT:    v_readlane_b32 s36, v32, 2
+; VI-NEXT:    v_readlane_b32 s35, v32, 1
+; VI-NEXT:    v_readlane_b32 s34, v32, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -30744,7 +30744,7 @@ end:
   ret <13 x i64> %phi
 }
 
-define <52 x i16> @bitcast_v13f64_to_v52i16(<13 x double> %a, i32 %b) {
+define <52 x i16> @bitcast_v13f64_to_v52i16(<13 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v13f64_to_v52i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -31437,7 +31437,7 @@ end:
   ret <52 x i16> %phi
 }
 
-define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a, i32 inreg %b) {
+define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v13f64_to_v52i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -31445,15 +31445,15 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a,
 ; SI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v45, s30, 0
-; SI-NEXT:    v_writelane_b32 v45, s31, 1
-; SI-NEXT:    v_writelane_b32 v45, s34, 2
-; SI-NEXT:    v_writelane_b32 v45, s35, 3
-; SI-NEXT:    v_writelane_b32 v45, s36, 4
-; SI-NEXT:    v_writelane_b32 v45, s37, 5
-; SI-NEXT:    v_writelane_b32 v45, s38, 6
+; SI-NEXT:    v_writelane_b32 v45, s34, 0
+; SI-NEXT:    v_writelane_b32 v45, s35, 1
+; SI-NEXT:    v_writelane_b32 v45, s36, 2
+; SI-NEXT:    v_writelane_b32 v45, s37, 3
+; SI-NEXT:    v_writelane_b32 v45, s38, 4
+; SI-NEXT:    v_writelane_b32 v45, s39, 5
+; SI-NEXT:    v_writelane_b32 v45, s48, 6
 ; SI-NEXT:    v_readfirstlane_b32 s4, v12
-; SI-NEXT:    v_writelane_b32 v45, s39, 7
+; SI-NEXT:    v_writelane_b32 v45, s30, 7
 ; SI-NEXT:    v_readfirstlane_b32 s13, v11
 ; SI-NEXT:    v_readfirstlane_b32 s12, v10
 ; SI-NEXT:    v_readfirstlane_b32 s15, v9
@@ -31472,7 +31472,7 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a,
 ; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 ; 4-byte Folded Spill
-; SI-NEXT:    v_writelane_b32 v45, s48, 8
+; SI-NEXT:    v_writelane_b32 v45, s31, 8
 ; SI-NEXT:    s_cbranch_scc0 .LBB49_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s48, s13, 16
@@ -31705,6 +31705,7 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a,
 ; SI-NEXT:    v_or_b32_e32 v24, v24, v26
 ; SI-NEXT:    v_and_b32_e32 v25, 0xffff, v25
 ; SI-NEXT:    v_lshlrev_b32_e32 v26, 16, v48
+; SI-NEXT:    v_readlane_b32 s30, v45, 7
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v38
 ; SI-NEXT:    v_or_b32_e32 v3, v3, v37
 ; SI-NEXT:    v_or_b32_e32 v5, v5, v36
@@ -31718,15 +31719,14 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a,
 ; SI-NEXT:    v_or_b32_e32 v21, v21, v28
 ; SI-NEXT:    v_or_b32_e32 v23, v23, v27
 ; SI-NEXT:    v_or_b32_e32 v25, v25, v26
-; SI-NEXT:    v_readlane_b32 s48, v45, 8
-; SI-NEXT:    v_readlane_b32 s39, v45, 7
-; SI-NEXT:    v_readlane_b32 s38, v45, 6
-; SI-NEXT:    v_readlane_b32 s37, v45, 5
-; SI-NEXT:    v_readlane_b32 s36, v45, 4
-; SI-NEXT:    v_readlane_b32 s35, v45, 3
-; SI-NEXT:    v_readlane_b32 s34, v45, 2
-; SI-NEXT:    v_readlane_b32 s31, v45, 1
-; SI-NEXT:    v_readlane_b32 s30, v45, 0
+; SI-NEXT:    v_readlane_b32 s31, v45, 8
+; SI-NEXT:    v_readlane_b32 s48, v45, 6
+; SI-NEXT:    v_readlane_b32 s39, v45, 5
+; SI-NEXT:    v_readlane_b32 s38, v45, 4
+; SI-NEXT:    v_readlane_b32 s37, v45, 3
+; SI-NEXT:    v_readlane_b32 s36, v45, 2
+; SI-NEXT:    v_readlane_b32 s35, v45, 1
+; SI-NEXT:    v_readlane_b32 s34, v45, 0
 ; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -32578,7 +32578,7 @@ end:
   ret <52 x i16> %phi
 }
 
-define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) {
+define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v52i16_to_v13f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -33727,7 +33727,7 @@ end:
   ret <13 x double> %phi
 }
 
-define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a, i32 inreg %b) {
+define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v52i16_to_v13f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -33735,33 +33735,34 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a,
 ; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v26, s30, 0
-; SI-NEXT:    v_writelane_b32 v26, s31, 1
-; SI-NEXT:    v_writelane_b32 v26, s34, 2
-; SI-NEXT:    v_writelane_b32 v26, s35, 3
-; SI-NEXT:    v_writelane_b32 v26, s36, 4
-; SI-NEXT:    v_writelane_b32 v26, s37, 5
-; SI-NEXT:    v_writelane_b32 v26, s38, 6
-; SI-NEXT:    v_writelane_b32 v26, s39, 7
-; SI-NEXT:    v_writelane_b32 v26, s48, 8
-; SI-NEXT:    v_writelane_b32 v26, s49, 9
-; SI-NEXT:    v_writelane_b32 v26, s50, 10
-; SI-NEXT:    v_writelane_b32 v26, s51, 11
-; SI-NEXT:    v_writelane_b32 v26, s52, 12
-; SI-NEXT:    v_writelane_b32 v26, s53, 13
-; SI-NEXT:    v_writelane_b32 v26, s54, 14
-; SI-NEXT:    v_writelane_b32 v26, s55, 15
-; SI-NEXT:    v_writelane_b32 v26, s64, 16
-; SI-NEXT:    v_writelane_b32 v26, s65, 17
-; SI-NEXT:    v_writelane_b32 v26, s66, 18
-; SI-NEXT:    v_writelane_b32 v26, s67, 19
-; SI-NEXT:    v_writelane_b32 v26, s68, 20
-; SI-NEXT:    v_writelane_b32 v26, s69, 21
-; SI-NEXT:    v_writelane_b32 v26, s70, 22
-; SI-NEXT:    v_writelane_b32 v26, s71, 23
-; SI-NEXT:    v_writelane_b32 v26, s80, 24
-; SI-NEXT:    v_writelane_b32 v26, s81, 25
-; SI-NEXT:    v_writelane_b32 v26, s82, 26
+; SI-NEXT:    v_writelane_b32 v26, s34, 0
+; SI-NEXT:    v_writelane_b32 v26, s35, 1
+; SI-NEXT:    v_writelane_b32 v26, s36, 2
+; SI-NEXT:    v_writelane_b32 v26, s37, 3
+; SI-NEXT:    v_writelane_b32 v26, s38, 4
+; SI-NEXT:    v_writelane_b32 v26, s39, 5
+; SI-NEXT:    v_writelane_b32 v26, s48, 6
+; SI-NEXT:    v_writelane_b32 v26, s49, 7
+; SI-NEXT:    v_writelane_b32 v26, s50, 8
+; SI-NEXT:    v_writelane_b32 v26, s51, 9
+; SI-NEXT:    v_writelane_b32 v26, s52, 10
+; SI-NEXT:    v_writelane_b32 v26, s53, 11
+; SI-NEXT:    v_writelane_b32 v26, s54, 12
+; SI-NEXT:    v_writelane_b32 v26, s55, 13
+; SI-NEXT:    v_writelane_b32 v26, s64, 14
+; SI-NEXT:    v_writelane_b32 v26, s65, 15
+; SI-NEXT:    v_writelane_b32 v26, s66, 16
+; SI-NEXT:    v_writelane_b32 v26, s67, 17
+; SI-NEXT:    v_writelane_b32 v26, s68, 18
+; SI-NEXT:    v_writelane_b32 v26, s69, 19
+; SI-NEXT:    v_writelane_b32 v26, s70, 20
+; SI-NEXT:    v_writelane_b32 v26, s71, 21
+; SI-NEXT:    v_writelane_b32 v26, s80, 22
+; SI-NEXT:    v_writelane_b32 v26, s81, 23
+; SI-NEXT:    v_writelane_b32 v26, s82, 24
+; SI-NEXT:    v_writelane_b32 v26, s83, 25
+; SI-NEXT:    v_writelane_b32 v26, s30, 26
+; SI-NEXT:    v_writelane_b32 v26, s31, 27
 ; SI-NEXT:    v_readfirstlane_b32 s7, v11
 ; SI-NEXT:    v_readfirstlane_b32 s9, v10
 ; SI-NEXT:    v_readfirstlane_b32 s11, v9
@@ -33774,7 +33775,6 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a,
 ; SI-NEXT:    v_readfirstlane_b32 s93, v2
 ; SI-NEXT:    v_readfirstlane_b32 s30, v1
 ; SI-NEXT:    v_readfirstlane_b32 s35, v0
-; SI-NEXT:    v_writelane_b32 v26, s83, 27
 ; SI-NEXT:    s_lshr_b32 s76, s29, 16
 ; SI-NEXT:    s_lshr_b32 s79, s28, 16
 ; SI-NEXT:    s_lshr_b32 s89, s27, 16
@@ -34016,6 +34016,7 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a,
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_add_i32 s61, s4, 0x30000
 ; SI-NEXT:  .LBB51_3: ; %end
+; SI-NEXT:    v_readlane_b32 s30, v26, 26
 ; SI-NEXT:    v_mov_b32_e32 v0, s36
 ; SI-NEXT:    v_mov_b32_e32 v1, s37
 ; SI-NEXT:    v_mov_b32_e32 v2, s38
@@ -34042,34 +34043,33 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a,
 ; SI-NEXT:    v_mov_b32_e32 v23, s59
 ; SI-NEXT:    v_mov_b32_e32 v24, s60
 ; SI-NEXT:    v_mov_b32_e32 v25, s61
-; SI-NEXT:    v_readlane_b32 s83, v26, 27
-; SI-NEXT:    v_readlane_b32 s82, v26, 26
-; SI-NEXT:    v_readlane_b32 s81, v26, 25
-; SI-NEXT:    v_readlane_b32 s80, v26, 24
-; SI-NEXT:    v_readlane_b32 s71, v26, 23
-; SI-NEXT:    v_readlane_b32 s70, v26, 22
-; SI-NEXT:    v_readlane_b32 s69, v26, 21
-; SI-NEXT:    v_readlane_b32 s68, v26, 20
-; SI-NEXT:    v_readlane_b32 s67, v26, 19
-; SI-NEXT:    v_readlane_b32 s66, v26, 18
-; SI-NEXT:    v_readlane_b32 s65, v26, 17
-; SI-NEXT:    v_readlane_b32 s64, v26, 16
-; SI-NEXT:    v_readlane_b32 s55, v26, 15
-; SI-NEXT:    v_readlane_b32 s54, v26, 14
-; SI-NEXT:    v_readlane_b32 s53, v26, 13
-; SI-NEXT:    v_readlane_b32 s52, v26, 12
-; SI-NEXT:    v_readlane_b32 s51, v26, 11
-; SI-NEXT:    v_readlane_b32 s50, v26, 10
-; SI-NEXT:    v_readlane_b32 s49, v26, 9
-; SI-NEXT:    v_readlane_b32 s48, v26, 8
-; SI-NEXT:    v_readlane_b32 s39, v26, 7
-; SI-NEXT:    v_readlane_b32 s38, v26, 6
-; SI-NEXT:    v_readlane_b32 s37, v26, 5
-; SI-NEXT:    v_readlane_b32 s36, v26, 4
-; SI-NEXT:    v_readlane_b32 s35, v26, 3
-; SI-NEXT:    v_readlane_b32 s34, v26, 2
-; SI-NEXT:    v_readlane_b32 s31, v26, 1
-; SI-NEXT:    v_readlane_b32 s30, v26, 0
+; SI-NEXT:    v_readlane_b32 s31, v26, 27
+; SI-NEXT:    v_readlane_b32 s83, v26, 25
+; SI-NEXT:    v_readlane_b32 s82, v26, 24
+; SI-NEXT:    v_readlane_b32 s81, v26, 23
+; SI-NEXT:    v_readlane_b32 s80, v26, 22
+; SI-NEXT:    v_readlane_b32 s71, v26, 21
+; SI-NEXT:    v_readlane_b32 s70, v26, 20
+; SI-NEXT:    v_readlane_b32 s69, v26, 19
+; SI-NEXT:    v_readlane_b32 s68, v26, 18
+; SI-NEXT:    v_readlane_b32 s67, v26, 17
+; SI-NEXT:    v_readlane_b32 s66, v26, 16
+; SI-NEXT:    v_readlane_b32 s65, v26, 15
+; SI-NEXT:    v_readlane_b32 s64, v26, 14
+; SI-NEXT:    v_readlane_b32 s55, v26, 13
+; SI-NEXT:    v_readlane_b32 s54, v26, 12
+; SI-NEXT:    v_readlane_b32 s53, v26, 11
+; SI-NEXT:    v_readlane_b32 s52, v26, 10
+; SI-NEXT:    v_readlane_b32 s51, v26, 9
+; SI-NEXT:    v_readlane_b32 s50, v26, 8
+; SI-NEXT:    v_readlane_b32 s49, v26, 7
+; SI-NEXT:    v_readlane_b32 s48, v26, 6
+; SI-NEXT:    v_readlane_b32 s39, v26, 5
+; SI-NEXT:    v_readlane_b32 s38, v26, 4
+; SI-NEXT:    v_readlane_b32 s37, v26, 3
+; SI-NEXT:    v_readlane_b32 s36, v26, 2
+; SI-NEXT:    v_readlane_b32 s35, v26, 1
+; SI-NEXT:    v_readlane_b32 s34, v26, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -34085,37 +34085,38 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a,
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v26, s30, 0
-; VI-NEXT:    v_writelane_b32 v26, s31, 1
-; VI-NEXT:    v_writelane_b32 v26, s34, 2
-; VI-NEXT:    v_writelane_b32 v26, s35, 3
-; VI-NEXT:    v_writelane_b32 v26, s36, 4
-; VI-NEXT:    v_writelane_b32 v26, s37, 5
-; VI-NEXT:    v_writelane_b32 v26, s38, 6
-; VI-NEXT:    v_writelane_b32 v26, s39, 7
-; VI-NEXT:    v_writelane_b32 v26, s48, 8
-; VI-NEXT:    v_writelane_b32 v26, s49, 9
-; VI-NEXT:    v_writelane_b32 v26, s50, 10
-; VI-NEXT:    v_writelane_b32 v26, s51, 11
-; VI-NEXT:    v_writelane_b32 v26, s52, 12
-; VI-NEXT:    v_writelane_b32 v26, s53, 13
-; VI-NEXT:    v_writelane_b32 v26, s54, 14
-; VI-NEXT:    v_writelane_b32 v26, s55, 15
-; VI-NEXT:    v_writelane_b32 v26, s64, 16
-; VI-NEXT:    v_writelane_b32 v26, s65, 17
-; VI-NEXT:    v_writelane_b32 v26, s66, 18
-; VI-NEXT:    v_writelane_b32 v26, s67, 19
-; VI-NEXT:    v_writelane_b32 v26, s68, 20
-; VI-NEXT:    v_writelane_b32 v26, s69, 21
-; VI-NEXT:    v_writelane_b32 v26, s70, 22
-; VI-NEXT:    v_writelane_b32 v26, s71, 23
-; VI-NEXT:    v_writelane_b32 v26, s80, 24
-; VI-NEXT:    v_writelane_b32 v26, s81, 25
-; VI-NEXT:    v_writelane_b32 v26, s82, 26
-; VI-NEXT:    v_writelane_b32 v26, s83, 27
-; VI-NEXT:    v_writelane_b32 v26, s84, 28
-; VI-NEXT:    v_writelane_b32 v26, s85, 29
-; VI-NEXT:    v_writelane_b32 v26, s86, 30
+; VI-NEXT:    v_writelane_b32 v26, s34, 0
+; VI-NEXT:    v_writelane_b32 v26, s35, 1
+; VI-NEXT:    v_writelane_b32 v26, s36, 2
+; VI-NEXT:    v_writelane_b32 v26, s37, 3
+; VI-NEXT:    v_writelane_b32 v26, s38, 4
+; VI-NEXT:    v_writelane_b32 v26, s39, 5
+; VI-NEXT:    v_writelane_b32 v26, s48, 6
+; VI-NEXT:    v_writelane_b32 v26, s49, 7
+; VI-NEXT:    v_writelane_b32 v26, s50, 8
+; VI-NEXT:    v_writelane_b32 v26, s51, 9
+; VI-NEXT:    v_writelane_b32 v26, s52, 10
+; VI-NEXT:    v_writelane_b32 v26, s53, 11
+; VI-NEXT:    v_writelane_b32 v26, s54, 12
+; VI-NEXT:    v_writelane_b32 v26, s55, 13
+; VI-NEXT:    v_writelane_b32 v26, s64, 14
+; VI-NEXT:    v_writelane_b32 v26, s65, 15
+; VI-NEXT:    v_writelane_b32 v26, s66, 16
+; VI-NEXT:    v_writelane_b32 v26, s67, 17
+; VI-NEXT:    v_writelane_b32 v26, s68, 18
+; VI-NEXT:    v_writelane_b32 v26, s69, 19
+; VI-NEXT:    v_writelane_b32 v26, s70, 20
+; VI-NEXT:    v_writelane_b32 v26, s71, 21
+; VI-NEXT:    v_writelane_b32 v26, s80, 22
+; VI-NEXT:    v_writelane_b32 v26, s81, 23
+; VI-NEXT:    v_writelane_b32 v26, s82, 24
+; VI-NEXT:    v_writelane_b32 v26, s83, 25
+; VI-NEXT:    v_writelane_b32 v26, s84, 26
+; VI-NEXT:    v_writelane_b32 v26, s85, 27
+; VI-NEXT:    v_writelane_b32 v26, s86, 28
+; VI-NEXT:    v_writelane_b32 v26, s87, 29
+; VI-NEXT:    v_writelane_b32 v26, s30, 30
+; VI-NEXT:    v_writelane_b32 v26, s31, 31
 ; VI-NEXT:    v_readfirstlane_b32 s7, v11
 ; VI-NEXT:    v_readfirstlane_b32 s9, v10
 ; VI-NEXT:    v_readfirstlane_b32 s11, v9
@@ -34128,7 +34129,6 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a,
 ; VI-NEXT:    v_readfirstlane_b32 s31, v2
 ; VI-NEXT:    v_readfirstlane_b32 s68, v1
 ; VI-NEXT:    v_readfirstlane_b32 s71, v0
-; VI-NEXT:    v_writelane_b32 v26, s87, 31
 ; VI-NEXT:    s_lshr_b32 s76, s29, 16
 ; VI-NEXT:    s_lshr_b32 s79, s28, 16
 ; VI-NEXT:    s_lshr_b32 s89, s27, 16
@@ -34370,6 +34370,7 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a,
 ; VI-NEXT:    s_or_b32 s4, s5, s4
 ; VI-NEXT:    s_add_i32 s61, s4, 0x30000
 ; VI-NEXT:  .LBB51_3: ; %end
+; VI-NEXT:    v_readlane_b32 s30, v26, 30
 ; VI-NEXT:    v_mov_b32_e32 v0, s36
 ; VI-NEXT:    v_mov_b32_e32 v1, s37
 ; VI-NEXT:    v_mov_b32_e32 v2, s38
@@ -34396,38 +34397,37 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a,
 ; VI-NEXT:    v_mov_b32_e32 v23, s59
 ; VI-NEXT:    v_mov_b32_e32 v24, s60
 ; VI-NEXT:    v_mov_b32_e32 v25, s61
-; VI-NEXT:    v_readlane_b32 s87, v26, 31
-; VI-NEXT:    v_readlane_b32 s86, v26, 30
-; VI-NEXT:    v_readlane_b32 s85, v26, 29
-; VI-NEXT:    v_readlane_b32 s84, v26, 28
-; VI-NEXT:    v_readlane_b32 s83, v26, 27
-; VI-NEXT:    v_readlane_b32 s82, v26, 26
-; VI-NEXT:    v_readlane_b32 s81, v26, 25
-; VI-NEXT:    v_readlane_b32 s80, v26, 24
-; VI-NEXT:    v_readlane_b32 s71, v26, 23
-; VI-NEXT:    v_readlane_b32 s70, v26, 22
-; VI-NEXT:    v_readlane_b32 s69, v26, 21
-; VI-NEXT:    v_readlane_b32 s68, v26, 20
-; VI-NEXT:    v_readlane_b32 s67, v26, 19
-; VI-NEXT:    v_readlane_b32 s66, v26, 18
-; VI-NEXT:    v_readlane_b32 s65, v26, 17
-; VI-NEXT:    v_readlane_b32 s64, v26, 16
-; VI-NEXT:    v_readlane_b32 s55, v26, 15
-; VI-NEXT:    v_readlane_b32 s54, v26, 14
-; VI-NEXT:    v_readlane_b32 s53, v26, 13
-; VI-NEXT:    v_readlane_b32 s52, v26, 12
-; VI-NEXT:    v_readlane_b32 s51, v26, 11
-; VI-NEXT:    v_readlane_b32 s50, v26, 10
-; VI-NEXT:    v_readlane_b32 s49, v26, 9
-; VI-NEXT:    v_readlane_b32 s48, v26, 8
-; VI-NEXT:    v_readlane_b32 s39, v26, 7
-; VI-NEXT:    v_readlane_b32 s38, v26, 6
-; VI-NEXT:    v_readlane_b32 s37, v26, 5
-; VI-NEXT:    v_readlane_b32 s36, v26, 4
-; VI-NEXT:    v_readlane_b32 s35, v26, 3
-; VI-NEXT:    v_readlane_b32 s34, v26, 2
-; VI-NEXT:    v_readlane_b32 s31, v26, 1
-; VI-NEXT:    v_readlane_b32 s30, v26, 0
+; VI-NEXT:    v_readlane_b32 s31, v26, 31
+; VI-NEXT:    v_readlane_b32 s87, v26, 29
+; VI-NEXT:    v_readlane_b32 s86, v26, 28
+; VI-NEXT:    v_readlane_b32 s85, v26, 27
+; VI-NEXT:    v_readlane_b32 s84, v26, 26
+; VI-NEXT:    v_readlane_b32 s83, v26, 25
+; VI-NEXT:    v_readlane_b32 s82, v26, 24
+; VI-NEXT:    v_readlane_b32 s81, v26, 23
+; VI-NEXT:    v_readlane_b32 s80, v26, 22
+; VI-NEXT:    v_readlane_b32 s71, v26, 21
+; VI-NEXT:    v_readlane_b32 s70, v26, 20
+; VI-NEXT:    v_readlane_b32 s69, v26, 19
+; VI-NEXT:    v_readlane_b32 s68, v26, 18
+; VI-NEXT:    v_readlane_b32 s67, v26, 17
+; VI-NEXT:    v_readlane_b32 s66, v26, 16
+; VI-NEXT:    v_readlane_b32 s65, v26, 15
+; VI-NEXT:    v_readlane_b32 s64, v26, 14
+; VI-NEXT:    v_readlane_b32 s55, v26, 13
+; VI-NEXT:    v_readlane_b32 s54, v26, 12
+; VI-NEXT:    v_readlane_b32 s53, v26, 11
+; VI-NEXT:    v_readlane_b32 s52, v26, 10
+; VI-NEXT:    v_readlane_b32 s51, v26, 9
+; VI-NEXT:    v_readlane_b32 s50, v26, 8
+; VI-NEXT:    v_readlane_b32 s49, v26, 7
+; VI-NEXT:    v_readlane_b32 s48, v26, 6
+; VI-NEXT:    v_readlane_b32 s39, v26, 5
+; VI-NEXT:    v_readlane_b32 s38, v26, 4
+; VI-NEXT:    v_readlane_b32 s37, v26, 3
+; VI-NEXT:    v_readlane_b32 s36, v26, 2
+; VI-NEXT:    v_readlane_b32 s35, v26, 1
+; VI-NEXT:    v_readlane_b32 s34, v26, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -34741,7 +34741,7 @@ end:
   ret <13 x double> %phi
 }
 
-define <52 x half> @bitcast_v13f64_to_v52f16(<13 x double> %a, i32 %b) {
+define <52 x half> @bitcast_v13f64_to_v52f16(<13 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v13f64_to_v52f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -35434,7 +35434,7 @@ end:
   ret <52 x half> %phi
 }
 
-define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a, i32 inreg %b) {
+define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v13f64_to_v52f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -35442,15 +35442,15 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a
 ; SI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v45, s30, 0
-; SI-NEXT:    v_writelane_b32 v45, s31, 1
-; SI-NEXT:    v_writelane_b32 v45, s34, 2
-; SI-NEXT:    v_writelane_b32 v45, s35, 3
-; SI-NEXT:    v_writelane_b32 v45, s36, 4
-; SI-NEXT:    v_writelane_b32 v45, s37, 5
-; SI-NEXT:    v_writelane_b32 v45, s38, 6
+; SI-NEXT:    v_writelane_b32 v45, s34, 0
+; SI-NEXT:    v_writelane_b32 v45, s35, 1
+; SI-NEXT:    v_writelane_b32 v45, s36, 2
+; SI-NEXT:    v_writelane_b32 v45, s37, 3
+; SI-NEXT:    v_writelane_b32 v45, s38, 4
+; SI-NEXT:    v_writelane_b32 v45, s39, 5
+; SI-NEXT:    v_writelane_b32 v45, s48, 6
 ; SI-NEXT:    v_readfirstlane_b32 s4, v12
-; SI-NEXT:    v_writelane_b32 v45, s39, 7
+; SI-NEXT:    v_writelane_b32 v45, s30, 7
 ; SI-NEXT:    v_readfirstlane_b32 s13, v11
 ; SI-NEXT:    v_readfirstlane_b32 s12, v10
 ; SI-NEXT:    v_readfirstlane_b32 s15, v9
@@ -35469,7 +35469,7 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a
 ; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 ; 4-byte Folded Spill
-; SI-NEXT:    v_writelane_b32 v45, s48, 8
+; SI-NEXT:    v_writelane_b32 v45, s31, 8
 ; SI-NEXT:    s_cbranch_scc0 .LBB53_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s48, s13, 16
@@ -35702,6 +35702,7 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a
 ; SI-NEXT:    v_or_b32_e32 v24, v24, v26
 ; SI-NEXT:    v_and_b32_e32 v25, 0xffff, v25
 ; SI-NEXT:    v_lshlrev_b32_e32 v26, 16, v48
+; SI-NEXT:    v_readlane_b32 s30, v45, 7
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v38
 ; SI-NEXT:    v_or_b32_e32 v3, v3, v37
 ; SI-NEXT:    v_or_b32_e32 v5, v5, v36
@@ -35715,15 +35716,14 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a
 ; SI-NEXT:    v_or_b32_e32 v21, v21, v28
 ; SI-NEXT:    v_or_b32_e32 v23, v23, v27
 ; SI-NEXT:    v_or_b32_e32 v25, v25, v26
-; SI-NEXT:    v_readlane_b32 s48, v45, 8
-; SI-NEXT:    v_readlane_b32 s39, v45, 7
-; SI-NEXT:    v_readlane_b32 s38, v45, 6
-; SI-NEXT:    v_readlane_b32 s37, v45, 5
-; SI-NEXT:    v_readlane_b32 s36, v45, 4
-; SI-NEXT:    v_readlane_b32 s35, v45, 3
-; SI-NEXT:    v_readlane_b32 s34, v45, 2
-; SI-NEXT:    v_readlane_b32 s31, v45, 1
-; SI-NEXT:    v_readlane_b32 s30, v45, 0
+; SI-NEXT:    v_readlane_b32 s31, v45, 8
+; SI-NEXT:    v_readlane_b32 s48, v45, 6
+; SI-NEXT:    v_readlane_b32 s39, v45, 5
+; SI-NEXT:    v_readlane_b32 s38, v45, 4
+; SI-NEXT:    v_readlane_b32 s37, v45, 3
+; SI-NEXT:    v_readlane_b32 s36, v45, 2
+; SI-NEXT:    v_readlane_b32 s35, v45, 1
+; SI-NEXT:    v_readlane_b32 s34, v45, 0
 ; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -36575,7 +36575,7 @@ end:
   ret <52 x half> %phi
 }
 
-define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) {
+define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v52f16_to_v13f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -37845,7 +37845,7 @@ end:
   ret <13 x double> %phi
 }
 
-define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a, i32 inreg %b) {
+define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v52f16_to_v13f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -37853,33 +37853,33 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
 ; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v32, s30, 0
-; SI-NEXT:    v_writelane_b32 v32, s31, 1
-; SI-NEXT:    v_writelane_b32 v32, s34, 2
-; SI-NEXT:    v_writelane_b32 v32, s35, 3
-; SI-NEXT:    v_writelane_b32 v32, s36, 4
-; SI-NEXT:    v_writelane_b32 v32, s37, 5
-; SI-NEXT:    v_writelane_b32 v32, s38, 6
-; SI-NEXT:    v_writelane_b32 v32, s39, 7
-; SI-NEXT:    v_writelane_b32 v32, s48, 8
-; SI-NEXT:    v_writelane_b32 v32, s49, 9
-; SI-NEXT:    v_writelane_b32 v32, s50, 10
-; SI-NEXT:    v_writelane_b32 v32, s51, 11
-; SI-NEXT:    v_writelane_b32 v32, s52, 12
-; SI-NEXT:    v_writelane_b32 v32, s53, 13
-; SI-NEXT:    v_writelane_b32 v32, s54, 14
-; SI-NEXT:    v_writelane_b32 v32, s55, 15
-; SI-NEXT:    v_writelane_b32 v32, s64, 16
-; SI-NEXT:    v_writelane_b32 v32, s65, 17
-; SI-NEXT:    v_writelane_b32 v32, s66, 18
-; SI-NEXT:    v_writelane_b32 v32, s67, 19
-; SI-NEXT:    v_writelane_b32 v32, s68, 20
-; SI-NEXT:    v_writelane_b32 v32, s69, 21
-; SI-NEXT:    v_writelane_b32 v32, s70, 22
-; SI-NEXT:    v_writelane_b32 v32, s71, 23
-; SI-NEXT:    v_writelane_b32 v32, s80, 24
-; SI-NEXT:    v_writelane_b32 v32, s81, 25
-; SI-NEXT:    v_writelane_b32 v32, s82, 26
+; SI-NEXT:    v_writelane_b32 v32, s34, 0
+; SI-NEXT:    v_writelane_b32 v32, s35, 1
+; SI-NEXT:    v_writelane_b32 v32, s36, 2
+; SI-NEXT:    v_writelane_b32 v32, s37, 3
+; SI-NEXT:    v_writelane_b32 v32, s38, 4
+; SI-NEXT:    v_writelane_b32 v32, s39, 5
+; SI-NEXT:    v_writelane_b32 v32, s48, 6
+; SI-NEXT:    v_writelane_b32 v32, s49, 7
+; SI-NEXT:    v_writelane_b32 v32, s50, 8
+; SI-NEXT:    v_writelane_b32 v32, s51, 9
+; SI-NEXT:    v_writelane_b32 v32, s52, 10
+; SI-NEXT:    v_writelane_b32 v32, s53, 11
+; SI-NEXT:    v_writelane_b32 v32, s54, 12
+; SI-NEXT:    v_writelane_b32 v32, s55, 13
+; SI-NEXT:    v_writelane_b32 v32, s64, 14
+; SI-NEXT:    v_writelane_b32 v32, s65, 15
+; SI-NEXT:    v_writelane_b32 v32, s66, 16
+; SI-NEXT:    v_writelane_b32 v32, s67, 17
+; SI-NEXT:    v_writelane_b32 v32, s68, 18
+; SI-NEXT:    v_writelane_b32 v32, s69, 19
+; SI-NEXT:    v_writelane_b32 v32, s70, 20
+; SI-NEXT:    v_writelane_b32 v32, s71, 21
+; SI-NEXT:    v_writelane_b32 v32, s80, 22
+; SI-NEXT:    v_writelane_b32 v32, s81, 23
+; SI-NEXT:    v_writelane_b32 v32, s82, 24
+; SI-NEXT:    v_writelane_b32 v32, s83, 25
+; SI-NEXT:    v_writelane_b32 v32, s30, 26
 ; SI-NEXT:    v_readfirstlane_b32 s6, v11
 ; SI-NEXT:    v_readfirstlane_b32 s8, v10
 ; SI-NEXT:    v_readfirstlane_b32 s10, v9
@@ -37892,7 +37892,7 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
 ; SI-NEXT:    v_readfirstlane_b32 s88, v2
 ; SI-NEXT:    v_readfirstlane_b32 s91, v1
 ; SI-NEXT:    v_readfirstlane_b32 s94, v0
-; SI-NEXT:    v_writelane_b32 v32, s83, 27
+; SI-NEXT:    v_writelane_b32 v32, s31, 27
 ; SI-NEXT:    s_lshr_b32 s90, s29, 16
 ; SI-NEXT:    s_lshr_b32 s93, s28, 16
 ; SI-NEXT:    s_lshr_b32 s30, s27, 16
@@ -38249,34 +38249,34 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
 ; SI-NEXT:    v_mov_b32_e32 v30, s66
 ; SI-NEXT:    v_mov_b32_e32 v31, s67
 ; SI-NEXT:  .LBB55_5: ; %end
-; SI-NEXT:    v_readlane_b32 s83, v32, 27
-; SI-NEXT:    v_readlane_b32 s82, v32, 26
-; SI-NEXT:    v_readlane_b32 s81, v32, 25
-; SI-NEXT:    v_readlane_b32 s80, v32, 24
-; SI-NEXT:    v_readlane_b32 s71, v32, 23
-; SI-NEXT:    v_readlane_b32 s70, v32, 22
-; SI-NEXT:    v_readlane_b32 s69, v32, 21
-; SI-NEXT:    v_readlane_b32 s68, v32, 20
-; SI-NEXT:    v_readlane_b32 s67, v32, 19
-; SI-NEXT:    v_readlane_b32 s66, v32, 18
-; SI-NEXT:    v_readlane_b32 s65, v32, 17
-; SI-NEXT:    v_readlane_b32 s64, v32, 16
-; SI-NEXT:    v_readlane_b32 s55, v32, 15
-; SI-NEXT:    v_readlane_b32 s54, v32, 14
-; SI-NEXT:    v_readlane_b32 s53, v32, 13
-; SI-NEXT:    v_readlane_b32 s52, v32, 12
-; SI-NEXT:    v_readlane_b32 s51, v32, 11
-; SI-NEXT:    v_readlane_b32 s50, v32, 10
-; SI-NEXT:    v_readlane_b32 s49, v32, 9
-; SI-NEXT:    v_readlane_b32 s48, v32, 8
-; SI-NEXT:    v_readlane_b32 s39, v32, 7
-; SI-NEXT:    v_readlane_b32 s38, v32, 6
-; SI-NEXT:    v_readlane_b32 s37, v32, 5
-; SI-NEXT:    v_readlane_b32 s36, v32, 4
-; SI-NEXT:    v_readlane_b32 s35, v32, 3
-; SI-NEXT:    v_readlane_b32 s34, v32, 2
-; SI-NEXT:    v_readlane_b32 s31, v32, 1
-; SI-NEXT:    v_readlane_b32 s30, v32, 0
+; SI-NEXT:    v_readlane_b32 s30, v32, 26
+; SI-NEXT:    v_readlane_b32 s31, v32, 27
+; SI-NEXT:    v_readlane_b32 s83, v32, 25
+; SI-NEXT:    v_readlane_b32 s82, v32, 24
+; SI-NEXT:    v_readlane_b32 s81, v32, 23
+; SI-NEXT:    v_readlane_b32 s80, v32, 22
+; SI-NEXT:    v_readlane_b32 s71, v32, 21
+; SI-NEXT:    v_readlane_b32 s70, v32, 20
+; SI-NEXT:    v_readlane_b32 s69, v32, 19
+; SI-NEXT:    v_readlane_b32 s68, v32, 18
+; SI-NEXT:    v_readlane_b32 s67, v32, 17
+; SI-NEXT:    v_readlane_b32 s66, v32, 16
+; SI-NEXT:    v_readlane_b32 s65, v32, 15
+; SI-NEXT:    v_readlane_b32 s64, v32, 14
+; SI-NEXT:    v_readlane_b32 s55, v32, 13
+; SI-NEXT:    v_readlane_b32 s54, v32, 12
+; SI-NEXT:    v_readlane_b32 s53, v32, 11
+; SI-NEXT:    v_readlane_b32 s52, v32, 10
+; SI-NEXT:    v_readlane_b32 s51, v32, 9
+; SI-NEXT:    v_readlane_b32 s50, v32, 8
+; SI-NEXT:    v_readlane_b32 s49, v32, 7
+; SI-NEXT:    v_readlane_b32 s48, v32, 6
+; SI-NEXT:    v_readlane_b32 s39, v32, 5
+; SI-NEXT:    v_readlane_b32 s38, v32, 4
+; SI-NEXT:    v_readlane_b32 s37, v32, 3
+; SI-NEXT:    v_readlane_b32 s36, v32, 2
+; SI-NEXT:    v_readlane_b32 s35, v32, 1
+; SI-NEXT:    v_readlane_b32 s34, v32, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -38289,37 +38289,38 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v32, s30, 0
-; VI-NEXT:    v_writelane_b32 v32, s31, 1
-; VI-NEXT:    v_writelane_b32 v32, s34, 2
-; VI-NEXT:    v_writelane_b32 v32, s35, 3
-; VI-NEXT:    v_writelane_b32 v32, s36, 4
-; VI-NEXT:    v_writelane_b32 v32, s37, 5
-; VI-NEXT:    v_writelane_b32 v32, s38, 6
-; VI-NEXT:    v_writelane_b32 v32, s39, 7
-; VI-NEXT:    v_writelane_b32 v32, s48, 8
-; VI-NEXT:    v_writelane_b32 v32, s49, 9
-; VI-NEXT:    v_writelane_b32 v32, s50, 10
-; VI-NEXT:    v_writelane_b32 v32, s51, 11
-; VI-NEXT:    v_writelane_b32 v32, s52, 12
-; VI-NEXT:    v_writelane_b32 v32, s53, 13
-; VI-NEXT:    v_writelane_b32 v32, s54, 14
-; VI-NEXT:    v_writelane_b32 v32, s55, 15
-; VI-NEXT:    v_writelane_b32 v32, s64, 16
-; VI-NEXT:    v_writelane_b32 v32, s65, 17
-; VI-NEXT:    v_writelane_b32 v32, s66, 18
-; VI-NEXT:    v_writelane_b32 v32, s67, 19
-; VI-NEXT:    v_writelane_b32 v32, s68, 20
-; VI-NEXT:    v_writelane_b32 v32, s69, 21
-; VI-NEXT:    v_writelane_b32 v32, s70, 22
-; VI-NEXT:    v_writelane_b32 v32, s71, 23
-; VI-NEXT:    v_writelane_b32 v32, s80, 24
-; VI-NEXT:    v_writelane_b32 v32, s81, 25
-; VI-NEXT:    v_writelane_b32 v32, s82, 26
-; VI-NEXT:    v_writelane_b32 v32, s83, 27
-; VI-NEXT:    v_writelane_b32 v32, s84, 28
-; VI-NEXT:    v_writelane_b32 v32, s85, 29
-; VI-NEXT:    v_writelane_b32 v32, s86, 30
+; VI-NEXT:    v_writelane_b32 v32, s34, 0
+; VI-NEXT:    v_writelane_b32 v32, s35, 1
+; VI-NEXT:    v_writelane_b32 v32, s36, 2
+; VI-NEXT:    v_writelane_b32 v32, s37, 3
+; VI-NEXT:    v_writelane_b32 v32, s38, 4
+; VI-NEXT:    v_writelane_b32 v32, s39, 5
+; VI-NEXT:    v_writelane_b32 v32, s48, 6
+; VI-NEXT:    v_writelane_b32 v32, s49, 7
+; VI-NEXT:    v_writelane_b32 v32, s50, 8
+; VI-NEXT:    v_writelane_b32 v32, s51, 9
+; VI-NEXT:    v_writelane_b32 v32, s52, 10
+; VI-NEXT:    v_writelane_b32 v32, s53, 11
+; VI-NEXT:    v_writelane_b32 v32, s54, 12
+; VI-NEXT:    v_writelane_b32 v32, s55, 13
+; VI-NEXT:    v_writelane_b32 v32, s64, 14
+; VI-NEXT:    v_writelane_b32 v32, s65, 15
+; VI-NEXT:    v_writelane_b32 v32, s66, 16
+; VI-NEXT:    v_writelane_b32 v32, s67, 17
+; VI-NEXT:    v_writelane_b32 v32, s68, 18
+; VI-NEXT:    v_writelane_b32 v32, s69, 19
+; VI-NEXT:    v_writelane_b32 v32, s70, 20
+; VI-NEXT:    v_writelane_b32 v32, s71, 21
+; VI-NEXT:    v_writelane_b32 v32, s80, 22
+; VI-NEXT:    v_writelane_b32 v32, s81, 23
+; VI-NEXT:    v_writelane_b32 v32, s82, 24
+; VI-NEXT:    v_writelane_b32 v32, s83, 25
+; VI-NEXT:    v_writelane_b32 v32, s84, 26
+; VI-NEXT:    v_writelane_b32 v32, s85, 27
+; VI-NEXT:    v_writelane_b32 v32, s86, 28
+; VI-NEXT:    v_writelane_b32 v32, s87, 29
+; VI-NEXT:    v_writelane_b32 v32, s30, 30
+; VI-NEXT:    v_writelane_b32 v32, s31, 31
 ; VI-NEXT:    v_readfirstlane_b32 s6, v11
 ; VI-NEXT:    v_readfirstlane_b32 s8, v10
 ; VI-NEXT:    v_readfirstlane_b32 s10, v9
@@ -38332,7 +38333,6 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
 ; VI-NEXT:    v_readfirstlane_b32 s30, v2
 ; VI-NEXT:    v_readfirstlane_b32 s35, v1
 ; VI-NEXT:    v_readfirstlane_b32 s71, v0
-; VI-NEXT:    v_writelane_b32 v32, s87, 31
 ; VI-NEXT:    s_lshr_b32 s74, s29, 16
 ; VI-NEXT:    s_lshr_b32 s77, s28, 16
 ; VI-NEXT:    s_lshr_b32 s88, s27, 16
@@ -38586,38 +38586,38 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
 ; VI-NEXT:    v_mov_b32_e32 v30, s66
 ; VI-NEXT:    v_mov_b32_e32 v31, s67
 ; VI-NEXT:  .LBB55_5: ; %end
-; VI-NEXT:    v_readlane_b32 s87, v32, 31
-; VI-NEXT:    v_readlane_b32 s86, v32, 30
-; VI-NEXT:    v_readlane_b32 s85, v32, 29
-; VI-NEXT:    v_readlane_b32 s84, v32, 28
-; VI-NEXT:    v_readlane_b32 s83, v32, 27
-; VI-NEXT:    v_readlane_b32 s82, v32, 26
-; VI-NEXT:    v_readlane_b32 s81, v32, 25
-; VI-NEXT:    v_readlane_b32 s80, v32, 24
-; VI-NEXT:    v_readlane_b32 s71, v32, 23
-; VI-NEXT:    v_readlane_b32 s70, v32, 22
-; VI-NEXT:    v_readlane_b32 s69, v32, 21
-; VI-NEXT:    v_readlane_b32 s68, v32, 20
-; VI-NEXT:    v_readlane_b32 s67, v32, 19
-; VI-NEXT:    v_readlane_b32 s66, v32, 18
-; VI-NEXT:    v_readlane_b32 s65, v32, 17
-; VI-NEXT:    v_readlane_b32 s64, v32, 16
-; VI-NEXT:    v_readlane_b32 s55, v32, 15
-; VI-NEXT:    v_readlane_b32 s54, v32, 14
-; VI-NEXT:    v_readlane_b32 s53, v32, 13
-; VI-NEXT:    v_readlane_b32 s52, v32, 12
-; VI-NEXT:    v_readlane_b32 s51, v32, 11
-; VI-NEXT:    v_readlane_b32 s50, v32, 10
-; VI-NEXT:    v_readlane_b32 s49, v32, 9
-; VI-NEXT:    v_readlane_b32 s48, v32, 8
-; VI-NEXT:    v_readlane_b32 s39, v32, 7
-; VI-NEXT:    v_readlane_b32 s38, v32, 6
-; VI-NEXT:    v_readlane_b32 s37, v32, 5
-; VI-NEXT:    v_readlane_b32 s36, v32, 4
-; VI-NEXT:    v_readlane_b32 s35, v32, 3
-; VI-NEXT:    v_readlane_b32 s34, v32, 2
-; VI-NEXT:    v_readlane_b32 s31, v32, 1
-; VI-NEXT:    v_readlane_b32 s30, v32, 0
+; VI-NEXT:    v_readlane_b32 s30, v32, 30
+; VI-NEXT:    v_readlane_b32 s31, v32, 31
+; VI-NEXT:    v_readlane_b32 s87, v32, 29
+; VI-NEXT:    v_readlane_b32 s86, v32, 28
+; VI-NEXT:    v_readlane_b32 s85, v32, 27
+; VI-NEXT:    v_readlane_b32 s84, v32, 26
+; VI-NEXT:    v_readlane_b32 s83, v32, 25
+; VI-NEXT:    v_readlane_b32 s82, v32, 24
+; VI-NEXT:    v_readlane_b32 s81, v32, 23
+; VI-NEXT:    v_readlane_b32 s80, v32, 22
+; VI-NEXT:    v_readlane_b32 s71, v32, 21
+; VI-NEXT:    v_readlane_b32 s70, v32, 20
+; VI-NEXT:    v_readlane_b32 s69, v32, 19
+; VI-NEXT:    v_readlane_b32 s68, v32, 18
+; VI-NEXT:    v_readlane_b32 s67, v32, 17
+; VI-NEXT:    v_readlane_b32 s66, v32, 16
+; VI-NEXT:    v_readlane_b32 s65, v32, 15
+; VI-NEXT:    v_readlane_b32 s64, v32, 14
+; VI-NEXT:    v_readlane_b32 s55, v32, 13
+; VI-NEXT:    v_readlane_b32 s54, v32, 12
+; VI-NEXT:    v_readlane_b32 s53, v32, 11
+; VI-NEXT:    v_readlane_b32 s52, v32, 10
+; VI-NEXT:    v_readlane_b32 s51, v32, 9
+; VI-NEXT:    v_readlane_b32 s50, v32, 8
+; VI-NEXT:    v_readlane_b32 s49, v32, 7
+; VI-NEXT:    v_readlane_b32 s48, v32, 6
+; VI-NEXT:    v_readlane_b32 s39, v32, 5
+; VI-NEXT:    v_readlane_b32 s38, v32, 4
+; VI-NEXT:    v_readlane_b32 s37, v32, 3
+; VI-NEXT:    v_readlane_b32 s36, v32, 2
+; VI-NEXT:    v_readlane_b32 s35, v32, 1
+; VI-NEXT:    v_readlane_b32 s34, v32, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -38929,7 +38929,7 @@ end:
   ret <13 x double> %phi
 }
 
-define <52 x half> @bitcast_v52i16_to_v52f16(<52 x i16> %a, i32 %b) {
+define <52 x half> @bitcast_v52i16_to_v52f16(<52 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v52i16_to_v52f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -40155,7 +40155,7 @@ end:
   ret <52 x half> %phi
 }
 
-define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i32 inreg %b) {
+define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v52i16_to_v52f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -40163,42 +40163,41 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v26, s30, 0
-; SI-NEXT:    v_writelane_b32 v26, s31, 1
-; SI-NEXT:    v_writelane_b32 v26, s34, 2
-; SI-NEXT:    v_writelane_b32 v26, s35, 3
-; SI-NEXT:    v_writelane_b32 v26, s36, 4
-; SI-NEXT:    v_writelane_b32 v26, s37, 5
-; SI-NEXT:    v_writelane_b32 v26, s38, 6
-; SI-NEXT:    v_writelane_b32 v26, s39, 7
-; SI-NEXT:    v_writelane_b32 v26, s48, 8
-; SI-NEXT:    v_writelane_b32 v26, s49, 9
-; SI-NEXT:    v_writelane_b32 v26, s50, 10
-; SI-NEXT:    v_writelane_b32 v26, s51, 11
-; SI-NEXT:    v_writelane_b32 v26, s52, 12
-; SI-NEXT:    v_writelane_b32 v26, s53, 13
-; SI-NEXT:    v_writelane_b32 v26, s54, 14
-; SI-NEXT:    v_writelane_b32 v26, s55, 15
-; SI-NEXT:    v_writelane_b32 v26, s64, 16
-; SI-NEXT:    v_writelane_b32 v26, s65, 17
-; SI-NEXT:    v_writelane_b32 v26, s66, 18
-; SI-NEXT:    v_writelane_b32 v26, s67, 19
-; SI-NEXT:    v_writelane_b32 v26, s68, 20
-; SI-NEXT:    v_writelane_b32 v26, s69, 21
-; SI-NEXT:    v_writelane_b32 v26, s70, 22
-; SI-NEXT:    v_writelane_b32 v26, s71, 23
-; SI-NEXT:    v_writelane_b32 v26, s80, 24
-; SI-NEXT:    v_writelane_b32 v26, s81, 25
-; SI-NEXT:    v_writelane_b32 v26, s82, 26
-; SI-NEXT:    v_writelane_b32 v26, s83, 27
-; SI-NEXT:    v_writelane_b32 v26, s84, 28
-; SI-NEXT:    v_writelane_b32 v26, s85, 29
-; SI-NEXT:    v_writelane_b32 v26, s86, 30
-; SI-NEXT:    v_writelane_b32 v26, s87, 31
-; SI-NEXT:    v_writelane_b32 v26, s96, 32
-; SI-NEXT:    v_writelane_b32 v26, s97, 33
-; SI-NEXT:    v_writelane_b32 v26, s98, 34
-; SI-NEXT:    v_writelane_b32 v26, s99, 35
+; SI-NEXT:    v_writelane_b32 v26, s34, 0
+; SI-NEXT:    v_writelane_b32 v26, s35, 1
+; SI-NEXT:    v_writelane_b32 v26, s36, 2
+; SI-NEXT:    v_writelane_b32 v26, s37, 3
+; SI-NEXT:    v_writelane_b32 v26, s38, 4
+; SI-NEXT:    v_writelane_b32 v26, s39, 5
+; SI-NEXT:    v_writelane_b32 v26, s48, 6
+; SI-NEXT:    v_writelane_b32 v26, s49, 7
+; SI-NEXT:    v_writelane_b32 v26, s50, 8
+; SI-NEXT:    v_writelane_b32 v26, s51, 9
+; SI-NEXT:    v_writelane_b32 v26, s52, 10
+; SI-NEXT:    v_writelane_b32 v26, s53, 11
+; SI-NEXT:    v_writelane_b32 v26, s54, 12
+; SI-NEXT:    v_writelane_b32 v26, s55, 13
+; SI-NEXT:    v_writelane_b32 v26, s64, 14
+; SI-NEXT:    v_writelane_b32 v26, s65, 15
+; SI-NEXT:    v_writelane_b32 v26, s66, 16
+; SI-NEXT:    v_writelane_b32 v26, s67, 17
+; SI-NEXT:    v_writelane_b32 v26, s68, 18
+; SI-NEXT:    v_writelane_b32 v26, s69, 19
+; SI-NEXT:    v_writelane_b32 v26, s70, 20
+; SI-NEXT:    v_writelane_b32 v26, s71, 21
+; SI-NEXT:    v_writelane_b32 v26, s80, 22
+; SI-NEXT:    v_writelane_b32 v26, s81, 23
+; SI-NEXT:    v_writelane_b32 v26, s82, 24
+; SI-NEXT:    v_writelane_b32 v26, s83, 25
+; SI-NEXT:    v_writelane_b32 v26, s84, 26
+; SI-NEXT:    v_writelane_b32 v26, s85, 27
+; SI-NEXT:    v_writelane_b32 v26, s86, 28
+; SI-NEXT:    v_writelane_b32 v26, s87, 29
+; SI-NEXT:    v_writelane_b32 v26, s96, 30
+; SI-NEXT:    v_writelane_b32 v26, s97, 31
+; SI-NEXT:    v_writelane_b32 v26, s98, 32
+; SI-NEXT:    v_writelane_b32 v26, s99, 33
+; SI-NEXT:    v_writelane_b32 v26, s30, 34
 ; SI-NEXT:    v_readfirstlane_b32 s85, v11
 ; SI-NEXT:    v_readfirstlane_b32 s99, v10
 ; SI-NEXT:    v_readfirstlane_b32 s81, v9
@@ -40211,6 +40210,7 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
 ; SI-NEXT:    v_readfirstlane_b32 s97, v2
 ; SI-NEXT:    v_readfirstlane_b32 s84, v1
 ; SI-NEXT:    v_readfirstlane_b32 s86, v0
+; SI-NEXT:    v_writelane_b32 v26, s31, 35
 ; SI-NEXT:    s_lshr_b32 s54, s29, 16
 ; SI-NEXT:    s_lshr_b32 s91, s28, 16
 ; SI-NEXT:    s_lshr_b32 s53, s27, 16
@@ -40602,6 +40602,7 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
 ; SI-NEXT:    s_and_b32 s29, s41, 0xffff
 ; SI-NEXT:    s_lshl_b32 s40, s68, 16
 ; SI-NEXT:    s_or_b32 s29, s29, s40
+; SI-NEXT:    v_readlane_b32 s30, v26, 34
 ; SI-NEXT:    v_mov_b32_e32 v0, s14
 ; SI-NEXT:    v_mov_b32_e32 v1, s15
 ; SI-NEXT:    v_mov_b32_e32 v2, s12
@@ -40628,42 +40629,41 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v23, s27
 ; SI-NEXT:    v_mov_b32_e32 v24, s28
 ; SI-NEXT:    v_mov_b32_e32 v25, s29
-; SI-NEXT:    v_readlane_b32 s99, v26, 35
-; SI-NEXT:    v_readlane_b32 s98, v26, 34
-; SI-NEXT:    v_readlane_b32 s97, v26, 33
-; SI-NEXT:    v_readlane_b32 s96, v26, 32
-; SI-NEXT:    v_readlane_b32 s87, v26, 31
-; SI-NEXT:    v_readlane_b32 s86, v26, 30
-; SI-NEXT:    v_readlane_b32 s85, v26, 29
-; SI-NEXT:    v_readlane_b32 s84, v26, 28
-; SI-NEXT:    v_readlane_b32 s83, v26, 27
-; SI-NEXT:    v_readlane_b32 s82, v26, 26
-; SI-NEXT:    v_readlane_b32 s81, v26, 25
-; SI-NEXT:    v_readlane_b32 s80, v26, 24
-; SI-NEXT:    v_readlane_b32 s71, v26, 23
-; SI-NEXT:    v_readlane_b32 s70, v26, 22
-; SI-NEXT:    v_readlane_b32 s69, v26, 21
-; SI-NEXT:    v_readlane_b32 s68, v26, 20
-; SI-NEXT:    v_readlane_b32 s67, v26, 19
-; SI-NEXT:    v_readlane_b32 s66, v26, 18
-; SI-NEXT:    v_readlane_b32 s65, v26, 17
-; SI-NEXT:    v_readlane_b32 s64, v26, 16
-; SI-NEXT:    v_readlane_b32 s55, v26, 15
-; SI-NEXT:    v_readlane_b32 s54, v26, 14
-; SI-NEXT:    v_readlane_b32 s53, v26, 13
-; SI-NEXT:    v_readlane_b32 s52, v26, 12
-; SI-NEXT:    v_readlane_b32 s51, v26, 11
-; SI-NEXT:    v_readlane_b32 s50, v26, 10
-; SI-NEXT:    v_readlane_b32 s49, v26, 9
-; SI-NEXT:    v_readlane_b32 s48, v26, 8
-; SI-NEXT:    v_readlane_b32 s39, v26, 7
-; SI-NEXT:    v_readlane_b32 s38, v26, 6
-; SI-NEXT:    v_readlane_b32 s37, v26, 5
-; SI-NEXT:    v_readlane_b32 s36, v26, 4
-; SI-NEXT:    v_readlane_b32 s35, v26, 3
-; SI-NEXT:    v_readlane_b32 s34, v26, 2
-; SI-NEXT:    v_readlane_b32 s31, v26, 1
-; SI-NEXT:    v_readlane_b32 s30, v26, 0
+; SI-NEXT:    v_readlane_b32 s31, v26, 35
+; SI-NEXT:    v_readlane_b32 s99, v26, 33
+; SI-NEXT:    v_readlane_b32 s98, v26, 32
+; SI-NEXT:    v_readlane_b32 s97, v26, 31
+; SI-NEXT:    v_readlane_b32 s96, v26, 30
+; SI-NEXT:    v_readlane_b32 s87, v26, 29
+; SI-NEXT:    v_readlane_b32 s86, v26, 28
+; SI-NEXT:    v_readlane_b32 s85, v26, 27
+; SI-NEXT:    v_readlane_b32 s84, v26, 26
+; SI-NEXT:    v_readlane_b32 s83, v26, 25
+; SI-NEXT:    v_readlane_b32 s82, v26, 24
+; SI-NEXT:    v_readlane_b32 s81, v26, 23
+; SI-NEXT:    v_readlane_b32 s80, v26, 22
+; SI-NEXT:    v_readlane_b32 s71, v26, 21
+; SI-NEXT:    v_readlane_b32 s70, v26, 20
+; SI-NEXT:    v_readlane_b32 s69, v26, 19
+; SI-NEXT:    v_readlane_b32 s68, v26, 18
+; SI-NEXT:    v_readlane_b32 s67, v26, 17
+; SI-NEXT:    v_readlane_b32 s66, v26, 16
+; SI-NEXT:    v_readlane_b32 s65, v26, 15
+; SI-NEXT:    v_readlane_b32 s64, v26, 14
+; SI-NEXT:    v_readlane_b32 s55, v26, 13
+; SI-NEXT:    v_readlane_b32 s54, v26, 12
+; SI-NEXT:    v_readlane_b32 s53, v26, 11
+; SI-NEXT:    v_readlane_b32 s52, v26, 10
+; SI-NEXT:    v_readlane_b32 s51, v26, 9
+; SI-NEXT:    v_readlane_b32 s50, v26, 8
+; SI-NEXT:    v_readlane_b32 s49, v26, 7
+; SI-NEXT:    v_readlane_b32 s48, v26, 6
+; SI-NEXT:    v_readlane_b32 s39, v26, 5
+; SI-NEXT:    v_readlane_b32 s38, v26, 4
+; SI-NEXT:    v_readlane_b32 s37, v26, 3
+; SI-NEXT:    v_readlane_b32 s36, v26, 2
+; SI-NEXT:    v_readlane_b32 s35, v26, 1
+; SI-NEXT:    v_readlane_b32 s34, v26, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -41559,7 +41559,7 @@ end:
   ret <52 x half> %phi
 }
 
-define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) {
+define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v52f16_to_v52i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -42433,7 +42433,7 @@ end:
   ret <52 x i16> %phi
 }
 
-define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i32 inreg %b) {
+define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v52f16_to_v52i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -43769,3 +43769,5 @@ end:
   %phi = phi <52 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
   ret <52 x i16> %phi
 }
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll
index a24315e66392c..17370fc4b8480 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll
@@ -6,7 +6,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
-define <28 x float> @bitcast_v28i32_to_v28f32(<28 x i32> %a, i32 %b) {
+define <28 x float> @bitcast_v28i32_to_v28f32(<28 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v28i32_to_v28f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -188,7 +188,7 @@ end:
   ret <28 x float> %phi
 }
 
-define inreg <28 x float> @bitcast_v28i32_to_v28f32_scalar(<28 x i32> inreg %a, i32 inreg %b) {
+define inreg <28 x float> @bitcast_v28i32_to_v28f32_scalar(<28 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v28i32_to_v28f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -526,7 +526,7 @@ end:
   ret <28 x float> %phi
 }
 
-define <28 x i32> @bitcast_v28f32_to_v28i32(<28 x float> %a, i32 %b) {
+define <28 x i32> @bitcast_v28f32_to_v28i32(<28 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v28f32_to_v28i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -694,7 +694,7 @@ end:
   ret <28 x i32> %phi
 }
 
-define inreg <28 x i32> @bitcast_v28f32_to_v28i32_scalar(<28 x float> inreg %a, i32 inreg %b) {
+define inreg <28 x i32> @bitcast_v28f32_to_v28i32_scalar(<28 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v28f32_to_v28i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1238,7 +1238,7 @@ end:
   ret <28 x i32> %phi
 }
 
-define <14 x i64> @bitcast_v28i32_to_v14i64(<28 x i32> %a, i32 %b) {
+define <14 x i64> @bitcast_v28i32_to_v14i64(<28 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v28i32_to_v14i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1420,7 +1420,7 @@ end:
   ret <14 x i64> %phi
 }
 
-define inreg <14 x i64> @bitcast_v28i32_to_v14i64_scalar(<28 x i32> inreg %a, i32 inreg %b) {
+define inreg <14 x i64> @bitcast_v28i32_to_v14i64_scalar(<28 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v28i32_to_v14i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1758,7 +1758,7 @@ end:
   ret <14 x i64> %phi
 }
 
-define <28 x i32> @bitcast_v14i64_to_v28i32(<14 x i64> %a, i32 %b) {
+define <28 x i32> @bitcast_v14i64_to_v28i32(<14 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v14i64_to_v28i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1947,7 +1947,7 @@ end:
   ret <28 x i32> %phi
 }
 
-define inreg <28 x i32> @bitcast_v14i64_to_v28i32_scalar(<14 x i64> inreg %a, i32 inreg %b) {
+define inreg <28 x i32> @bitcast_v14i64_to_v28i32_scalar(<14 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v14i64_to_v28i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2285,7 +2285,7 @@ end:
   ret <28 x i32> %phi
 }
 
-define <14 x double> @bitcast_v28i32_to_v14f64(<28 x i32> %a, i32 %b) {
+define <14 x double> @bitcast_v28i32_to_v14f64(<28 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v28i32_to_v14f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2467,7 +2467,7 @@ end:
   ret <14 x double> %phi
 }
 
-define inreg <14 x double> @bitcast_v28i32_to_v14f64_scalar(<28 x i32> inreg %a, i32 inreg %b) {
+define inreg <14 x double> @bitcast_v28i32_to_v14f64_scalar(<28 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v28i32_to_v14f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2805,7 +2805,7 @@ end:
   ret <14 x double> %phi
 }
 
-define <28 x i32> @bitcast_v14f64_to_v28i32(<14 x double> %a, i32 %b) {
+define <28 x i32> @bitcast_v14f64_to_v28i32(<14 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v14f64_to_v28i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2931,7 +2931,7 @@ end:
   ret <28 x i32> %phi
 }
 
-define inreg <28 x i32> @bitcast_v14f64_to_v28i32_scalar(<14 x double> inreg %a, i32 inreg %b) {
+define inreg <28 x i32> @bitcast_v14f64_to_v28i32_scalar(<14 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v14f64_to_v28i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3419,7 +3419,7 @@ end:
   ret <28 x i32> %phi
 }
 
-define <56 x i16> @bitcast_v28i32_to_v56i16(<28 x i32> %a, i32 %b) {
+define <56 x i16> @bitcast_v28i32_to_v56i16(<28 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v28i32_to_v56i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4255,7 +4255,7 @@ end:
   ret <56 x i16> %phi
 }
 
-define inreg <56 x i16> @bitcast_v28i32_to_v56i16_scalar(<28 x i32> inreg %a, i32 inreg %b) {
+define inreg <56 x i16> @bitcast_v28i32_to_v56i16_scalar(<28 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v28i32_to_v56i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4263,20 +4263,20 @@ define inreg <56 x i16> @bitcast_v28i32_to_v56i16_scalar(<28 x i32> inreg %a, i3
 ; SI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v28, s30, 0
-; SI-NEXT:    v_writelane_b32 v28, s31, 1
-; SI-NEXT:    v_writelane_b32 v28, s34, 2
-; SI-NEXT:    v_writelane_b32 v28, s35, 3
-; SI-NEXT:    v_writelane_b32 v28, s36, 4
-; SI-NEXT:    v_writelane_b32 v28, s37, 5
-; SI-NEXT:    v_writelane_b32 v28, s38, 6
-; SI-NEXT:    v_writelane_b32 v28, s39, 7
-; SI-NEXT:    v_writelane_b32 v28, s48, 8
-; SI-NEXT:    v_writelane_b32 v28, s49, 9
-; SI-NEXT:    v_writelane_b32 v28, s50, 10
-; SI-NEXT:    v_writelane_b32 v28, s51, 11
+; SI-NEXT:    v_writelane_b32 v28, s34, 0
+; SI-NEXT:    v_writelane_b32 v28, s35, 1
+; SI-NEXT:    v_writelane_b32 v28, s36, 2
+; SI-NEXT:    v_writelane_b32 v28, s37, 3
+; SI-NEXT:    v_writelane_b32 v28, s38, 4
+; SI-NEXT:    v_writelane_b32 v28, s39, 5
+; SI-NEXT:    v_writelane_b32 v28, s48, 6
+; SI-NEXT:    v_writelane_b32 v28, s49, 7
+; SI-NEXT:    v_writelane_b32 v28, s50, 8
+; SI-NEXT:    v_writelane_b32 v28, s51, 9
+; SI-NEXT:    v_writelane_b32 v28, s52, 10
+; SI-NEXT:    v_writelane_b32 v28, s53, 11
 ; SI-NEXT:    v_readfirstlane_b32 s40, v14
-; SI-NEXT:    v_writelane_b32 v28, s52, 12
+; SI-NEXT:    v_writelane_b32 v28, s30, 12
 ; SI-NEXT:    v_readfirstlane_b32 s5, v13
 ; SI-NEXT:    v_readfirstlane_b32 s4, v12
 ; SI-NEXT:    v_readfirstlane_b32 s7, v11
@@ -4292,7 +4292,7 @@ define inreg <56 x i16> @bitcast_v28i32_to_v56i16_scalar(<28 x i32> inreg %a, i3
 ; SI-NEXT:    v_readfirstlane_b32 s41, v1
 ; SI-NEXT:    s_cmp_lg_u32 s40, 0
 ; SI-NEXT:    v_readfirstlane_b32 s40, v0
-; SI-NEXT:    v_writelane_b32 v28, s53, 13
+; SI-NEXT:    v_writelane_b32 v28, s31, 13
 ; SI-NEXT:    s_cbranch_scc0 .LBB13_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s30, s5, 16
@@ -4466,6 +4466,7 @@ define inreg <56 x i16> @bitcast_v28i32_to_v56i16_scalar(<28 x i32> inreg %a, i3
 ; SI-NEXT:    s_lshl_b32 s42, s30, 16
 ; SI-NEXT:    s_or_b32 s7, s7, s43
 ; SI-NEXT:    s_or_b32 s5, s5, s42
+; SI-NEXT:    v_readlane_b32 s30, v28, 12
 ; SI-NEXT:    v_mov_b32_e32 v0, s16
 ; SI-NEXT:    v_mov_b32_e32 v1, s17
 ; SI-NEXT:    v_mov_b32_e32 v2, s18
@@ -4494,20 +4495,19 @@ define inreg <56 x i16> @bitcast_v28i32_to_v56i16_scalar(<28 x i32> inreg %a, i3
 ; SI-NEXT:    v_mov_b32_e32 v25, s7
 ; SI-NEXT:    v_mov_b32_e32 v26, s4
 ; SI-NEXT:    v_mov_b32_e32 v27, s5
-; SI-NEXT:    v_readlane_b32 s53, v28, 13
-; SI-NEXT:    v_readlane_b32 s52, v28, 12
-; SI-NEXT:    v_readlane_b32 s51, v28, 11
-; SI-NEXT:    v_readlane_b32 s50, v28, 10
-; SI-NEXT:    v_readlane_b32 s49, v28, 9
-; SI-NEXT:    v_readlane_b32 s48, v28, 8
-; SI-NEXT:    v_readlane_b32 s39, v28, 7
-; SI-NEXT:    v_readlane_b32 s38, v28, 6
-; SI-NEXT:    v_readlane_b32 s37, v28, 5
-; SI-NEXT:    v_readlane_b32 s36, v28, 4
-; SI-NEXT:    v_readlane_b32 s35, v28, 3
-; SI-NEXT:    v_readlane_b32 s34, v28, 2
-; SI-NEXT:    v_readlane_b32 s31, v28, 1
-; SI-NEXT:    v_readlane_b32 s30, v28, 0
+; SI-NEXT:    v_readlane_b32 s31, v28, 13
+; SI-NEXT:    v_readlane_b32 s53, v28, 11
+; SI-NEXT:    v_readlane_b32 s52, v28, 10
+; SI-NEXT:    v_readlane_b32 s51, v28, 9
+; SI-NEXT:    v_readlane_b32 s50, v28, 8
+; SI-NEXT:    v_readlane_b32 s49, v28, 7
+; SI-NEXT:    v_readlane_b32 s48, v28, 6
+; SI-NEXT:    v_readlane_b32 s39, v28, 5
+; SI-NEXT:    v_readlane_b32 s38, v28, 4
+; SI-NEXT:    v_readlane_b32 s37, v28, 3
+; SI-NEXT:    v_readlane_b32 s36, v28, 2
+; SI-NEXT:    v_readlane_b32 s35, v28, 1
+; SI-NEXT:    v_readlane_b32 s34, v28, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -4550,10 +4550,10 @@ define inreg <56 x i16> @bitcast_v28i32_to_v56i16_scalar(<28 x i32> inreg %a, i3
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v28, s30, 0
-; VI-NEXT:    v_writelane_b32 v28, s31, 1
+; VI-NEXT:    v_writelane_b32 v28, s34, 0
+; VI-NEXT:    v_writelane_b32 v28, s35, 1
 ; VI-NEXT:    v_readfirstlane_b32 s4, v14
-; VI-NEXT:    v_writelane_b32 v28, s34, 2
+; VI-NEXT:    v_writelane_b32 v28, s30, 2
 ; VI-NEXT:    v_readfirstlane_b32 s6, v13
 ; VI-NEXT:    v_readfirstlane_b32 s7, v12
 ; VI-NEXT:    v_readfirstlane_b32 s8, v11
@@ -4569,7 +4569,7 @@ define inreg <56 x i16> @bitcast_v28i32_to_v56i16_scalar(<28 x i32> inreg %a, i3
 ; VI-NEXT:    v_readfirstlane_b32 s42, v1
 ; VI-NEXT:    s_cmp_lg_u32 s4, 0
 ; VI-NEXT:    v_readfirstlane_b32 s43, v0
-; VI-NEXT:    v_writelane_b32 v28, s35, 3
+; VI-NEXT:    v_writelane_b32 v28, s31, 3
 ; VI-NEXT:    s_cbranch_scc0 .LBB13_4
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    s_lshr_b32 s44, s6, 16
@@ -4743,6 +4743,7 @@ define inreg <56 x i16> @bitcast_v28i32_to_v56i16_scalar(<28 x i32> inreg %a, i3
 ; VI-NEXT:    s_and_b32 s6, 0xffff, s6
 ; VI-NEXT:    s_lshl_b32 s42, s44, 16
 ; VI-NEXT:    s_or_b32 s6, s6, s42
+; VI-NEXT:    v_readlane_b32 s30, v28, 2
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_mov_b32_e32 v2, s16
@@ -4771,10 +4772,9 @@ define inreg <56 x i16> @bitcast_v28i32_to_v56i16_scalar(<28 x i32> inreg %a, i3
 ; VI-NEXT:    v_mov_b32_e32 v25, s8
 ; VI-NEXT:    v_mov_b32_e32 v26, s7
 ; VI-NEXT:    v_mov_b32_e32 v27, s6
-; VI-NEXT:    v_readlane_b32 s35, v28, 3
-; VI-NEXT:    v_readlane_b32 s34, v28, 2
-; VI-NEXT:    v_readlane_b32 s31, v28, 1
-; VI-NEXT:    v_readlane_b32 s30, v28, 0
+; VI-NEXT:    v_readlane_b32 s31, v28, 3
+; VI-NEXT:    v_readlane_b32 s35, v28, 1
+; VI-NEXT:    v_readlane_b32 s34, v28, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -5204,7 +5204,7 @@ end:
   ret <56 x i16> %phi
 }
 
-define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) {
+define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v56i16_to_v28i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6449,7 +6449,7 @@ end:
   ret <28 x i32> %phi
 }
 
-define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i32 inreg %b) {
+define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v56i16_to_v28i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6457,37 +6457,38 @@ define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i3
 ; SI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v28, s30, 0
-; SI-NEXT:    v_writelane_b32 v28, s31, 1
-; SI-NEXT:    v_writelane_b32 v28, s34, 2
-; SI-NEXT:    v_writelane_b32 v28, s35, 3
-; SI-NEXT:    v_writelane_b32 v28, s36, 4
-; SI-NEXT:    v_writelane_b32 v28, s37, 5
-; SI-NEXT:    v_writelane_b32 v28, s38, 6
-; SI-NEXT:    v_writelane_b32 v28, s39, 7
-; SI-NEXT:    v_writelane_b32 v28, s48, 8
-; SI-NEXT:    v_writelane_b32 v28, s49, 9
-; SI-NEXT:    v_writelane_b32 v28, s50, 10
-; SI-NEXT:    v_writelane_b32 v28, s51, 11
-; SI-NEXT:    v_writelane_b32 v28, s52, 12
-; SI-NEXT:    v_writelane_b32 v28, s53, 13
-; SI-NEXT:    v_writelane_b32 v28, s54, 14
-; SI-NEXT:    v_writelane_b32 v28, s55, 15
-; SI-NEXT:    v_writelane_b32 v28, s64, 16
-; SI-NEXT:    v_writelane_b32 v28, s65, 17
-; SI-NEXT:    v_writelane_b32 v28, s66, 18
-; SI-NEXT:    v_writelane_b32 v28, s67, 19
-; SI-NEXT:    v_writelane_b32 v28, s68, 20
-; SI-NEXT:    v_writelane_b32 v28, s69, 21
-; SI-NEXT:    v_writelane_b32 v28, s70, 22
-; SI-NEXT:    v_writelane_b32 v28, s71, 23
-; SI-NEXT:    v_writelane_b32 v28, s80, 24
-; SI-NEXT:    v_writelane_b32 v28, s81, 25
-; SI-NEXT:    v_writelane_b32 v28, s82, 26
-; SI-NEXT:    v_writelane_b32 v28, s83, 27
-; SI-NEXT:    v_writelane_b32 v28, s84, 28
-; SI-NEXT:    v_writelane_b32 v28, s85, 29
-; SI-NEXT:    v_writelane_b32 v28, s86, 30
+; SI-NEXT:    v_writelane_b32 v28, s34, 0
+; SI-NEXT:    v_writelane_b32 v28, s35, 1
+; SI-NEXT:    v_writelane_b32 v28, s36, 2
+; SI-NEXT:    v_writelane_b32 v28, s37, 3
+; SI-NEXT:    v_writelane_b32 v28, s38, 4
+; SI-NEXT:    v_writelane_b32 v28, s39, 5
+; SI-NEXT:    v_writelane_b32 v28, s48, 6
+; SI-NEXT:    v_writelane_b32 v28, s49, 7
+; SI-NEXT:    v_writelane_b32 v28, s50, 8
+; SI-NEXT:    v_writelane_b32 v28, s51, 9
+; SI-NEXT:    v_writelane_b32 v28, s52, 10
+; SI-NEXT:    v_writelane_b32 v28, s53, 11
+; SI-NEXT:    v_writelane_b32 v28, s54, 12
+; SI-NEXT:    v_writelane_b32 v28, s55, 13
+; SI-NEXT:    v_writelane_b32 v28, s64, 14
+; SI-NEXT:    v_writelane_b32 v28, s65, 15
+; SI-NEXT:    v_writelane_b32 v28, s66, 16
+; SI-NEXT:    v_writelane_b32 v28, s67, 17
+; SI-NEXT:    v_writelane_b32 v28, s68, 18
+; SI-NEXT:    v_writelane_b32 v28, s69, 19
+; SI-NEXT:    v_writelane_b32 v28, s70, 20
+; SI-NEXT:    v_writelane_b32 v28, s71, 21
+; SI-NEXT:    v_writelane_b32 v28, s80, 22
+; SI-NEXT:    v_writelane_b32 v28, s81, 23
+; SI-NEXT:    v_writelane_b32 v28, s82, 24
+; SI-NEXT:    v_writelane_b32 v28, s83, 25
+; SI-NEXT:    v_writelane_b32 v28, s84, 26
+; SI-NEXT:    v_writelane_b32 v28, s85, 27
+; SI-NEXT:    v_writelane_b32 v28, s86, 28
+; SI-NEXT:    v_writelane_b32 v28, s87, 29
+; SI-NEXT:    v_writelane_b32 v28, s30, 30
+; SI-NEXT:    v_writelane_b32 v28, s31, 31
 ; SI-NEXT:    v_readfirstlane_b32 s7, v13
 ; SI-NEXT:    v_readfirstlane_b32 s9, v12
 ; SI-NEXT:    v_readfirstlane_b32 s11, v11
@@ -6502,7 +6503,6 @@ define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i3
 ; SI-NEXT:    v_readfirstlane_b32 s31, v2
 ; SI-NEXT:    v_readfirstlane_b32 s69, v1
 ; SI-NEXT:    v_readfirstlane_b32 s80, v0
-; SI-NEXT:    v_writelane_b32 v28, s87, 31
 ; SI-NEXT:    s_lshr_b32 s78, s29, 16
 ; SI-NEXT:    s_lshr_b32 s89, s28, 16
 ; SI-NEXT:    s_lshr_b32 s92, s27, 16
@@ -6762,6 +6762,7 @@ define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i3
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_add_i32 s63, s4, 0x30000
 ; SI-NEXT:  .LBB15_3: ; %end
+; SI-NEXT:    v_readlane_b32 s30, v28, 30
 ; SI-NEXT:    v_mov_b32_e32 v0, s36
 ; SI-NEXT:    v_mov_b32_e32 v1, s37
 ; SI-NEXT:    v_mov_b32_e32 v2, s38
@@ -6790,38 +6791,37 @@ define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i3
 ; SI-NEXT:    v_mov_b32_e32 v25, s61
 ; SI-NEXT:    v_mov_b32_e32 v26, s62
 ; SI-NEXT:    v_mov_b32_e32 v27, s63
-; SI-NEXT:    v_readlane_b32 s87, v28, 31
-; SI-NEXT:    v_readlane_b32 s86, v28, 30
-; SI-NEXT:    v_readlane_b32 s85, v28, 29
-; SI-NEXT:    v_readlane_b32 s84, v28, 28
-; SI-NEXT:    v_readlane_b32 s83, v28, 27
-; SI-NEXT:    v_readlane_b32 s82, v28, 26
-; SI-NEXT:    v_readlane_b32 s81, v28, 25
-; SI-NEXT:    v_readlane_b32 s80, v28, 24
-; SI-NEXT:    v_readlane_b32 s71, v28, 23
-; SI-NEXT:    v_readlane_b32 s70, v28, 22
-; SI-NEXT:    v_readlane_b32 s69, v28, 21
-; SI-NEXT:    v_readlane_b32 s68, v28, 20
-; SI-NEXT:    v_readlane_b32 s67, v28, 19
-; SI-NEXT:    v_readlane_b32 s66, v28, 18
-; SI-NEXT:    v_readlane_b32 s65, v28, 17
-; SI-NEXT:    v_readlane_b32 s64, v28, 16
-; SI-NEXT:    v_readlane_b32 s55, v28, 15
-; SI-NEXT:    v_readlane_b32 s54, v28, 14
-; SI-NEXT:    v_readlane_b32 s53, v28, 13
-; SI-NEXT:    v_readlane_b32 s52, v28, 12
-; SI-NEXT:    v_readlane_b32 s51, v28, 11
-; SI-NEXT:    v_readlane_b32 s50, v28, 10
-; SI-NEXT:    v_readlane_b32 s49, v28, 9
-; SI-NEXT:    v_readlane_b32 s48, v28, 8
-; SI-NEXT:    v_readlane_b32 s39, v28, 7
-; SI-NEXT:    v_readlane_b32 s38, v28, 6
-; SI-NEXT:    v_readlane_b32 s37, v28, 5
-; SI-NEXT:    v_readlane_b32 s36, v28, 4
-; SI-NEXT:    v_readlane_b32 s35, v28, 3
-; SI-NEXT:    v_readlane_b32 s34, v28, 2
-; SI-NEXT:    v_readlane_b32 s31, v28, 1
-; SI-NEXT:    v_readlane_b32 s30, v28, 0
+; SI-NEXT:    v_readlane_b32 s31, v28, 31
+; SI-NEXT:    v_readlane_b32 s87, v28, 29
+; SI-NEXT:    v_readlane_b32 s86, v28, 28
+; SI-NEXT:    v_readlane_b32 s85, v28, 27
+; SI-NEXT:    v_readlane_b32 s84, v28, 26
+; SI-NEXT:    v_readlane_b32 s83, v28, 25
+; SI-NEXT:    v_readlane_b32 s82, v28, 24
+; SI-NEXT:    v_readlane_b32 s81, v28, 23
+; SI-NEXT:    v_readlane_b32 s80, v28, 22
+; SI-NEXT:    v_readlane_b32 s71, v28, 21
+; SI-NEXT:    v_readlane_b32 s70, v28, 20
+; SI-NEXT:    v_readlane_b32 s69, v28, 19
+; SI-NEXT:    v_readlane_b32 s68, v28, 18
+; SI-NEXT:    v_readlane_b32 s67, v28, 17
+; SI-NEXT:    v_readlane_b32 s66, v28, 16
+; SI-NEXT:    v_readlane_b32 s65, v28, 15
+; SI-NEXT:    v_readlane_b32 s64, v28, 14
+; SI-NEXT:    v_readlane_b32 s55, v28, 13
+; SI-NEXT:    v_readlane_b32 s54, v28, 12
+; SI-NEXT:    v_readlane_b32 s53, v28, 11
+; SI-NEXT:    v_readlane_b32 s52, v28, 10
+; SI-NEXT:    v_readlane_b32 s51, v28, 9
+; SI-NEXT:    v_readlane_b32 s50, v28, 8
+; SI-NEXT:    v_readlane_b32 s49, v28, 7
+; SI-NEXT:    v_readlane_b32 s48, v28, 6
+; SI-NEXT:    v_readlane_b32 s39, v28, 5
+; SI-NEXT:    v_readlane_b32 s38, v28, 4
+; SI-NEXT:    v_readlane_b32 s37, v28, 3
+; SI-NEXT:    v_readlane_b32 s36, v28, 2
+; SI-NEXT:    v_readlane_b32 s35, v28, 1
+; SI-NEXT:    v_readlane_b32 s34, v28, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -6838,38 +6838,37 @@ define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i3
 ; VI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v28, s30, 0
-; VI-NEXT:    v_writelane_b32 v28, s31, 1
-; VI-NEXT:    v_writelane_b32 v28, s34, 2
-; VI-NEXT:    v_writelane_b32 v28, s35, 3
-; VI-NEXT:    v_writelane_b32 v28, s36, 4
-; VI-NEXT:    v_writelane_b32 v28, s37, 5
-; VI-NEXT:    v_writelane_b32 v28, s38, 6
-; VI-NEXT:    v_writelane_b32 v28, s39, 7
-; VI-NEXT:    v_writelane_b32 v28, s48, 8
-; VI-NEXT:    v_writelane_b32 v28, s49, 9
-; VI-NEXT:    v_writelane_b32 v28, s50, 10
-; VI-NEXT:    v_writelane_b32 v28, s51, 11
-; VI-NEXT:    v_writelane_b32 v28, s52, 12
-; VI-NEXT:    v_writelane_b32 v28, s53, 13
-; VI-NEXT:    v_writelane_b32 v28, s54, 14
-; VI-NEXT:    v_writelane_b32 v28, s55, 15
-; VI-NEXT:    v_writelane_b32 v28, s64, 16
-; VI-NEXT:    v_writelane_b32 v28, s65, 17
-; VI-NEXT:    v_writelane_b32 v28, s66, 18
-; VI-NEXT:    v_writelane_b32 v28, s67, 19
-; VI-NEXT:    v_writelane_b32 v28, s68, 20
-; VI-NEXT:    v_writelane_b32 v28, s69, 21
-; VI-NEXT:    v_writelane_b32 v28, s70, 22
-; VI-NEXT:    v_writelane_b32 v28, s71, 23
-; VI-NEXT:    v_writelane_b32 v28, s80, 24
-; VI-NEXT:    v_writelane_b32 v28, s81, 25
-; VI-NEXT:    v_writelane_b32 v28, s82, 26
-; VI-NEXT:    v_writelane_b32 v28, s83, 27
-; VI-NEXT:    v_writelane_b32 v28, s84, 28
-; VI-NEXT:    v_writelane_b32 v28, s85, 29
-; VI-NEXT:    v_writelane_b32 v28, s86, 30
-; VI-NEXT:    v_writelane_b32 v28, s87, 31
+; VI-NEXT:    v_writelane_b32 v28, s34, 0
+; VI-NEXT:    v_writelane_b32 v28, s35, 1
+; VI-NEXT:    v_writelane_b32 v28, s36, 2
+; VI-NEXT:    v_writelane_b32 v28, s37, 3
+; VI-NEXT:    v_writelane_b32 v28, s38, 4
+; VI-NEXT:    v_writelane_b32 v28, s39, 5
+; VI-NEXT:    v_writelane_b32 v28, s48, 6
+; VI-NEXT:    v_writelane_b32 v28, s49, 7
+; VI-NEXT:    v_writelane_b32 v28, s50, 8
+; VI-NEXT:    v_writelane_b32 v28, s51, 9
+; VI-NEXT:    v_writelane_b32 v28, s52, 10
+; VI-NEXT:    v_writelane_b32 v28, s53, 11
+; VI-NEXT:    v_writelane_b32 v28, s54, 12
+; VI-NEXT:    v_writelane_b32 v28, s55, 13
+; VI-NEXT:    v_writelane_b32 v28, s64, 14
+; VI-NEXT:    v_writelane_b32 v28, s65, 15
+; VI-NEXT:    v_writelane_b32 v28, s66, 16
+; VI-NEXT:    v_writelane_b32 v28, s67, 17
+; VI-NEXT:    v_writelane_b32 v28, s68, 18
+; VI-NEXT:    v_writelane_b32 v28, s69, 19
+; VI-NEXT:    v_writelane_b32 v28, s70, 20
+; VI-NEXT:    v_writelane_b32 v28, s71, 21
+; VI-NEXT:    v_writelane_b32 v28, s80, 22
+; VI-NEXT:    v_writelane_b32 v28, s81, 23
+; VI-NEXT:    v_writelane_b32 v28, s82, 24
+; VI-NEXT:    v_writelane_b32 v28, s83, 25
+; VI-NEXT:    v_writelane_b32 v28, s84, 26
+; VI-NEXT:    v_writelane_b32 v28, s85, 27
+; VI-NEXT:    v_writelane_b32 v28, s86, 28
+; VI-NEXT:    v_writelane_b32 v28, s87, 29
+; VI-NEXT:    v_writelane_b32 v28, s30, 30
 ; VI-NEXT:    v_readfirstlane_b32 s86, v13
 ; VI-NEXT:    v_readfirstlane_b32 s6, v12
 ; VI-NEXT:    v_readfirstlane_b32 s9, v11
@@ -6884,6 +6883,7 @@ define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i3
 ; VI-NEXT:    v_readfirstlane_b32 s69, v2
 ; VI-NEXT:    v_readfirstlane_b32 s81, v1
 ; VI-NEXT:    v_readfirstlane_b32 s84, v0
+; VI-NEXT:    v_writelane_b32 v28, s31, 31
 ; VI-NEXT:    s_lshr_b32 s79, s29, 16
 ; VI-NEXT:    s_lshr_b32 s90, s28, 16
 ; VI-NEXT:    s_lshr_b32 s31, s27, 16
@@ -7143,6 +7143,7 @@ define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i3
 ; VI-NEXT:    s_or_b32 s4, s5, s4
 ; VI-NEXT:    s_add_i32 s63, s4, 0x30000
 ; VI-NEXT:  .LBB15_3: ; %end
+; VI-NEXT:    v_readlane_b32 s30, v28, 30
 ; VI-NEXT:    v_mov_b32_e32 v0, s36
 ; VI-NEXT:    v_mov_b32_e32 v1, s37
 ; VI-NEXT:    v_mov_b32_e32 v2, s38
@@ -7171,38 +7172,37 @@ define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i3
 ; VI-NEXT:    v_mov_b32_e32 v25, s61
 ; VI-NEXT:    v_mov_b32_e32 v26, s62
 ; VI-NEXT:    v_mov_b32_e32 v27, s63
-; VI-NEXT:    v_readlane_b32 s87, v28, 31
-; VI-NEXT:    v_readlane_b32 s86, v28, 30
-; VI-NEXT:    v_readlane_b32 s85, v28, 29
-; VI-NEXT:    v_readlane_b32 s84, v28, 28
-; VI-NEXT:    v_readlane_b32 s83, v28, 27
-; VI-NEXT:    v_readlane_b32 s82, v28, 26
-; VI-NEXT:    v_readlane_b32 s81, v28, 25
-; VI-NEXT:    v_readlane_b32 s80, v28, 24
-; VI-NEXT:    v_readlane_b32 s71, v28, 23
-; VI-NEXT:    v_readlane_b32 s70, v28, 22
-; VI-NEXT:    v_readlane_b32 s69, v28, 21
-; VI-NEXT:    v_readlane_b32 s68, v28, 20
-; VI-NEXT:    v_readlane_b32 s67, v28, 19
-; VI-NEXT:    v_readlane_b32 s66, v28, 18
-; VI-NEXT:    v_readlane_b32 s65, v28, 17
-; VI-NEXT:    v_readlane_b32 s64, v28, 16
-; VI-NEXT:    v_readlane_b32 s55, v28, 15
-; VI-NEXT:    v_readlane_b32 s54, v28, 14
-; VI-NEXT:    v_readlane_b32 s53, v28, 13
-; VI-NEXT:    v_readlane_b32 s52, v28, 12
-; VI-NEXT:    v_readlane_b32 s51, v28, 11
-; VI-NEXT:    v_readlane_b32 s50, v28, 10
-; VI-NEXT:    v_readlane_b32 s49, v28, 9
-; VI-NEXT:    v_readlane_b32 s48, v28, 8
-; VI-NEXT:    v_readlane_b32 s39, v28, 7
-; VI-NEXT:    v_readlane_b32 s38, v28, 6
-; VI-NEXT:    v_readlane_b32 s37, v28, 5
-; VI-NEXT:    v_readlane_b32 s36, v28, 4
-; VI-NEXT:    v_readlane_b32 s35, v28, 3
-; VI-NEXT:    v_readlane_b32 s34, v28, 2
-; VI-NEXT:    v_readlane_b32 s31, v28, 1
-; VI-NEXT:    v_readlane_b32 s30, v28, 0
+; VI-NEXT:    v_readlane_b32 s31, v28, 31
+; VI-NEXT:    v_readlane_b32 s87, v28, 29
+; VI-NEXT:    v_readlane_b32 s86, v28, 28
+; VI-NEXT:    v_readlane_b32 s85, v28, 27
+; VI-NEXT:    v_readlane_b32 s84, v28, 26
+; VI-NEXT:    v_readlane_b32 s83, v28, 25
+; VI-NEXT:    v_readlane_b32 s82, v28, 24
+; VI-NEXT:    v_readlane_b32 s81, v28, 23
+; VI-NEXT:    v_readlane_b32 s80, v28, 22
+; VI-NEXT:    v_readlane_b32 s71, v28, 21
+; VI-NEXT:    v_readlane_b32 s70, v28, 20
+; VI-NEXT:    v_readlane_b32 s69, v28, 19
+; VI-NEXT:    v_readlane_b32 s68, v28, 18
+; VI-NEXT:    v_readlane_b32 s67, v28, 17
+; VI-NEXT:    v_readlane_b32 s66, v28, 16
+; VI-NEXT:    v_readlane_b32 s65, v28, 15
+; VI-NEXT:    v_readlane_b32 s64, v28, 14
+; VI-NEXT:    v_readlane_b32 s55, v28, 13
+; VI-NEXT:    v_readlane_b32 s54, v28, 12
+; VI-NEXT:    v_readlane_b32 s53, v28, 11
+; VI-NEXT:    v_readlane_b32 s52, v28, 10
+; VI-NEXT:    v_readlane_b32 s51, v28, 9
+; VI-NEXT:    v_readlane_b32 s50, v28, 8
+; VI-NEXT:    v_readlane_b32 s49, v28, 7
+; VI-NEXT:    v_readlane_b32 s48, v28, 6
+; VI-NEXT:    v_readlane_b32 s39, v28, 5
+; VI-NEXT:    v_readlane_b32 s38, v28, 4
+; VI-NEXT:    v_readlane_b32 s37, v28, 3
+; VI-NEXT:    v_readlane_b32 s36, v28, 2
+; VI-NEXT:    v_readlane_b32 s35, v28, 1
+; VI-NEXT:    v_readlane_b32 s34, v28, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -7620,7 +7620,7 @@ end:
   ret <28 x i32> %phi
 }
 
-define <56 x half> @bitcast_v28i32_to_v56f16(<28 x i32> %a, i32 %b) {
+define <56 x half> @bitcast_v28i32_to_v56f16(<28 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v28i32_to_v56f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8456,7 +8456,7 @@ end:
   ret <56 x half> %phi
 }
 
-define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i32 inreg %b) {
+define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v28i32_to_v56f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8464,20 +8464,20 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v28, s30, 0
-; SI-NEXT:    v_writelane_b32 v28, s31, 1
-; SI-NEXT:    v_writelane_b32 v28, s34, 2
-; SI-NEXT:    v_writelane_b32 v28, s35, 3
-; SI-NEXT:    v_writelane_b32 v28, s36, 4
-; SI-NEXT:    v_writelane_b32 v28, s37, 5
-; SI-NEXT:    v_writelane_b32 v28, s38, 6
-; SI-NEXT:    v_writelane_b32 v28, s39, 7
-; SI-NEXT:    v_writelane_b32 v28, s48, 8
-; SI-NEXT:    v_writelane_b32 v28, s49, 9
-; SI-NEXT:    v_writelane_b32 v28, s50, 10
-; SI-NEXT:    v_writelane_b32 v28, s51, 11
+; SI-NEXT:    v_writelane_b32 v28, s34, 0
+; SI-NEXT:    v_writelane_b32 v28, s35, 1
+; SI-NEXT:    v_writelane_b32 v28, s36, 2
+; SI-NEXT:    v_writelane_b32 v28, s37, 3
+; SI-NEXT:    v_writelane_b32 v28, s38, 4
+; SI-NEXT:    v_writelane_b32 v28, s39, 5
+; SI-NEXT:    v_writelane_b32 v28, s48, 6
+; SI-NEXT:    v_writelane_b32 v28, s49, 7
+; SI-NEXT:    v_writelane_b32 v28, s50, 8
+; SI-NEXT:    v_writelane_b32 v28, s51, 9
+; SI-NEXT:    v_writelane_b32 v28, s52, 10
+; SI-NEXT:    v_writelane_b32 v28, s53, 11
 ; SI-NEXT:    v_readfirstlane_b32 s40, v14
-; SI-NEXT:    v_writelane_b32 v28, s52, 12
+; SI-NEXT:    v_writelane_b32 v28, s30, 12
 ; SI-NEXT:    v_readfirstlane_b32 s5, v13
 ; SI-NEXT:    v_readfirstlane_b32 s4, v12
 ; SI-NEXT:    v_readfirstlane_b32 s7, v11
@@ -8493,7 +8493,7 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i
 ; SI-NEXT:    v_readfirstlane_b32 s41, v1
 ; SI-NEXT:    s_cmp_lg_u32 s40, 0
 ; SI-NEXT:    v_readfirstlane_b32 s40, v0
-; SI-NEXT:    v_writelane_b32 v28, s53, 13
+; SI-NEXT:    v_writelane_b32 v28, s31, 13
 ; SI-NEXT:    s_cbranch_scc0 .LBB17_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s30, s5, 16
@@ -8667,6 +8667,7 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i
 ; SI-NEXT:    s_lshl_b32 s42, s30, 16
 ; SI-NEXT:    s_or_b32 s7, s7, s43
 ; SI-NEXT:    s_or_b32 s5, s5, s42
+; SI-NEXT:    v_readlane_b32 s30, v28, 12
 ; SI-NEXT:    v_mov_b32_e32 v0, s16
 ; SI-NEXT:    v_mov_b32_e32 v1, s17
 ; SI-NEXT:    v_mov_b32_e32 v2, s18
@@ -8695,20 +8696,19 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v25, s7
 ; SI-NEXT:    v_mov_b32_e32 v26, s4
 ; SI-NEXT:    v_mov_b32_e32 v27, s5
-; SI-NEXT:    v_readlane_b32 s53, v28, 13
-; SI-NEXT:    v_readlane_b32 s52, v28, 12
-; SI-NEXT:    v_readlane_b32 s51, v28, 11
-; SI-NEXT:    v_readlane_b32 s50, v28, 10
-; SI-NEXT:    v_readlane_b32 s49, v28, 9
-; SI-NEXT:    v_readlane_b32 s48, v28, 8
-; SI-NEXT:    v_readlane_b32 s39, v28, 7
-; SI-NEXT:    v_readlane_b32 s38, v28, 6
-; SI-NEXT:    v_readlane_b32 s37, v28, 5
-; SI-NEXT:    v_readlane_b32 s36, v28, 4
-; SI-NEXT:    v_readlane_b32 s35, v28, 3
-; SI-NEXT:    v_readlane_b32 s34, v28, 2
-; SI-NEXT:    v_readlane_b32 s31, v28, 1
-; SI-NEXT:    v_readlane_b32 s30, v28, 0
+; SI-NEXT:    v_readlane_b32 s31, v28, 13
+; SI-NEXT:    v_readlane_b32 s53, v28, 11
+; SI-NEXT:    v_readlane_b32 s52, v28, 10
+; SI-NEXT:    v_readlane_b32 s51, v28, 9
+; SI-NEXT:    v_readlane_b32 s50, v28, 8
+; SI-NEXT:    v_readlane_b32 s49, v28, 7
+; SI-NEXT:    v_readlane_b32 s48, v28, 6
+; SI-NEXT:    v_readlane_b32 s39, v28, 5
+; SI-NEXT:    v_readlane_b32 s38, v28, 4
+; SI-NEXT:    v_readlane_b32 s37, v28, 3
+; SI-NEXT:    v_readlane_b32 s36, v28, 2
+; SI-NEXT:    v_readlane_b32 s35, v28, 1
+; SI-NEXT:    v_readlane_b32 s34, v28, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -8751,10 +8751,10 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v28, s30, 0
-; VI-NEXT:    v_writelane_b32 v28, s31, 1
+; VI-NEXT:    v_writelane_b32 v28, s34, 0
+; VI-NEXT:    v_writelane_b32 v28, s35, 1
 ; VI-NEXT:    v_readfirstlane_b32 s4, v14
-; VI-NEXT:    v_writelane_b32 v28, s34, 2
+; VI-NEXT:    v_writelane_b32 v28, s30, 2
 ; VI-NEXT:    v_readfirstlane_b32 s6, v13
 ; VI-NEXT:    v_readfirstlane_b32 s7, v12
 ; VI-NEXT:    v_readfirstlane_b32 s8, v11
@@ -8770,7 +8770,7 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i
 ; VI-NEXT:    v_readfirstlane_b32 s42, v1
 ; VI-NEXT:    s_cmp_lg_u32 s4, 0
 ; VI-NEXT:    v_readfirstlane_b32 s43, v0
-; VI-NEXT:    v_writelane_b32 v28, s35, 3
+; VI-NEXT:    v_writelane_b32 v28, s31, 3
 ; VI-NEXT:    s_cbranch_scc0 .LBB17_4
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    s_lshr_b32 s44, s6, 16
@@ -8944,6 +8944,7 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i
 ; VI-NEXT:    s_and_b32 s6, 0xffff, s6
 ; VI-NEXT:    s_lshl_b32 s42, s44, 16
 ; VI-NEXT:    s_or_b32 s6, s6, s42
+; VI-NEXT:    v_readlane_b32 s30, v28, 2
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_mov_b32_e32 v2, s16
@@ -8972,10 +8973,9 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i
 ; VI-NEXT:    v_mov_b32_e32 v25, s8
 ; VI-NEXT:    v_mov_b32_e32 v26, s7
 ; VI-NEXT:    v_mov_b32_e32 v27, s6
-; VI-NEXT:    v_readlane_b32 s35, v28, 3
-; VI-NEXT:    v_readlane_b32 s34, v28, 2
-; VI-NEXT:    v_readlane_b32 s31, v28, 1
-; VI-NEXT:    v_readlane_b32 s30, v28, 0
+; VI-NEXT:    v_readlane_b32 s31, v28, 3
+; VI-NEXT:    v_readlane_b32 s35, v28, 1
+; VI-NEXT:    v_readlane_b32 s34, v28, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -9405,7 +9405,7 @@ end:
   ret <56 x half> %phi
 }
 
-define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) {
+define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v56f16_to_v28i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10787,7 +10787,7 @@ end:
   ret <28 x i32> %phi
 }
 
-define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i32 inreg %b) {
+define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v56f16_to_v28i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10795,37 +10795,37 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v32, s30, 0
-; SI-NEXT:    v_writelane_b32 v32, s31, 1
-; SI-NEXT:    v_writelane_b32 v32, s34, 2
-; SI-NEXT:    v_writelane_b32 v32, s35, 3
-; SI-NEXT:    v_writelane_b32 v32, s36, 4
-; SI-NEXT:    v_writelane_b32 v32, s37, 5
-; SI-NEXT:    v_writelane_b32 v32, s38, 6
-; SI-NEXT:    v_writelane_b32 v32, s39, 7
-; SI-NEXT:    v_writelane_b32 v32, s48, 8
-; SI-NEXT:    v_writelane_b32 v32, s49, 9
-; SI-NEXT:    v_writelane_b32 v32, s50, 10
-; SI-NEXT:    v_writelane_b32 v32, s51, 11
-; SI-NEXT:    v_writelane_b32 v32, s52, 12
-; SI-NEXT:    v_writelane_b32 v32, s53, 13
-; SI-NEXT:    v_writelane_b32 v32, s54, 14
-; SI-NEXT:    v_writelane_b32 v32, s55, 15
-; SI-NEXT:    v_writelane_b32 v32, s64, 16
-; SI-NEXT:    v_writelane_b32 v32, s65, 17
-; SI-NEXT:    v_writelane_b32 v32, s66, 18
-; SI-NEXT:    v_writelane_b32 v32, s67, 19
-; SI-NEXT:    v_writelane_b32 v32, s68, 20
-; SI-NEXT:    v_writelane_b32 v32, s69, 21
-; SI-NEXT:    v_writelane_b32 v32, s70, 22
-; SI-NEXT:    v_writelane_b32 v32, s71, 23
-; SI-NEXT:    v_writelane_b32 v32, s80, 24
-; SI-NEXT:    v_writelane_b32 v32, s81, 25
-; SI-NEXT:    v_writelane_b32 v32, s82, 26
-; SI-NEXT:    v_writelane_b32 v32, s83, 27
-; SI-NEXT:    v_writelane_b32 v32, s84, 28
-; SI-NEXT:    v_writelane_b32 v32, s85, 29
-; SI-NEXT:    v_writelane_b32 v32, s86, 30
+; SI-NEXT:    v_writelane_b32 v32, s34, 0
+; SI-NEXT:    v_writelane_b32 v32, s35, 1
+; SI-NEXT:    v_writelane_b32 v32, s36, 2
+; SI-NEXT:    v_writelane_b32 v32, s37, 3
+; SI-NEXT:    v_writelane_b32 v32, s38, 4
+; SI-NEXT:    v_writelane_b32 v32, s39, 5
+; SI-NEXT:    v_writelane_b32 v32, s48, 6
+; SI-NEXT:    v_writelane_b32 v32, s49, 7
+; SI-NEXT:    v_writelane_b32 v32, s50, 8
+; SI-NEXT:    v_writelane_b32 v32, s51, 9
+; SI-NEXT:    v_writelane_b32 v32, s52, 10
+; SI-NEXT:    v_writelane_b32 v32, s53, 11
+; SI-NEXT:    v_writelane_b32 v32, s54, 12
+; SI-NEXT:    v_writelane_b32 v32, s55, 13
+; SI-NEXT:    v_writelane_b32 v32, s64, 14
+; SI-NEXT:    v_writelane_b32 v32, s65, 15
+; SI-NEXT:    v_writelane_b32 v32, s66, 16
+; SI-NEXT:    v_writelane_b32 v32, s67, 17
+; SI-NEXT:    v_writelane_b32 v32, s68, 18
+; SI-NEXT:    v_writelane_b32 v32, s69, 19
+; SI-NEXT:    v_writelane_b32 v32, s70, 20
+; SI-NEXT:    v_writelane_b32 v32, s71, 21
+; SI-NEXT:    v_writelane_b32 v32, s80, 22
+; SI-NEXT:    v_writelane_b32 v32, s81, 23
+; SI-NEXT:    v_writelane_b32 v32, s82, 24
+; SI-NEXT:    v_writelane_b32 v32, s83, 25
+; SI-NEXT:    v_writelane_b32 v32, s84, 26
+; SI-NEXT:    v_writelane_b32 v32, s85, 27
+; SI-NEXT:    v_writelane_b32 v32, s86, 28
+; SI-NEXT:    v_writelane_b32 v32, s87, 29
+; SI-NEXT:    v_writelane_b32 v32, s30, 30
 ; SI-NEXT:    v_readfirstlane_b32 s6, v13
 ; SI-NEXT:    v_readfirstlane_b32 s8, v12
 ; SI-NEXT:    v_readfirstlane_b32 s10, v11
@@ -10840,7 +10840,7 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i
 ; SI-NEXT:    v_readfirstlane_b32 s92, v2
 ; SI-NEXT:    v_readfirstlane_b32 s95, v1
 ; SI-NEXT:    v_readfirstlane_b32 s34, v0
-; SI-NEXT:    v_writelane_b32 v32, s87, 31
+; SI-NEXT:    v_writelane_b32 v32, s31, 31
 ; SI-NEXT:    s_lshr_b32 s94, s29, 16
 ; SI-NEXT:    s_lshr_b32 s30, s28, 16
 ; SI-NEXT:    s_lshr_b32 s35, s27, 16
@@ -11221,38 +11221,38 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v30, s66
 ; SI-NEXT:    v_mov_b32_e32 v31, s67
 ; SI-NEXT:  .LBB19_5: ; %end
-; SI-NEXT:    v_readlane_b32 s87, v32, 31
-; SI-NEXT:    v_readlane_b32 s86, v32, 30
-; SI-NEXT:    v_readlane_b32 s85, v32, 29
-; SI-NEXT:    v_readlane_b32 s84, v32, 28
-; SI-NEXT:    v_readlane_b32 s83, v32, 27
-; SI-NEXT:    v_readlane_b32 s82, v32, 26
-; SI-NEXT:    v_readlane_b32 s81, v32, 25
-; SI-NEXT:    v_readlane_b32 s80, v32, 24
-; SI-NEXT:    v_readlane_b32 s71, v32, 23
-; SI-NEXT:    v_readlane_b32 s70, v32, 22
-; SI-NEXT:    v_readlane_b32 s69, v32, 21
-; SI-NEXT:    v_readlane_b32 s68, v32, 20
-; SI-NEXT:    v_readlane_b32 s67, v32, 19
-; SI-NEXT:    v_readlane_b32 s66, v32, 18
-; SI-NEXT:    v_readlane_b32 s65, v32, 17
-; SI-NEXT:    v_readlane_b32 s64, v32, 16
-; SI-NEXT:    v_readlane_b32 s55, v32, 15
-; SI-NEXT:    v_readlane_b32 s54, v32, 14
-; SI-NEXT:    v_readlane_b32 s53, v32, 13
-; SI-NEXT:    v_readlane_b32 s52, v32, 12
-; SI-NEXT:    v_readlane_b32 s51, v32, 11
-; SI-NEXT:    v_readlane_b32 s50, v32, 10
-; SI-NEXT:    v_readlane_b32 s49, v32, 9
-; SI-NEXT:    v_readlane_b32 s48, v32, 8
-; SI-NEXT:    v_readlane_b32 s39, v32, 7
-; SI-NEXT:    v_readlane_b32 s38, v32, 6
-; SI-NEXT:    v_readlane_b32 s37, v32, 5
-; SI-NEXT:    v_readlane_b32 s36, v32, 4
-; SI-NEXT:    v_readlane_b32 s35, v32, 3
-; SI-NEXT:    v_readlane_b32 s34, v32, 2
-; SI-NEXT:    v_readlane_b32 s31, v32, 1
-; SI-NEXT:    v_readlane_b32 s30, v32, 0
+; SI-NEXT:    v_readlane_b32 s30, v32, 30
+; SI-NEXT:    v_readlane_b32 s31, v32, 31
+; SI-NEXT:    v_readlane_b32 s87, v32, 29
+; SI-NEXT:    v_readlane_b32 s86, v32, 28
+; SI-NEXT:    v_readlane_b32 s85, v32, 27
+; SI-NEXT:    v_readlane_b32 s84, v32, 26
+; SI-NEXT:    v_readlane_b32 s83, v32, 25
+; SI-NEXT:    v_readlane_b32 s82, v32, 24
+; SI-NEXT:    v_readlane_b32 s81, v32, 23
+; SI-NEXT:    v_readlane_b32 s80, v32, 22
+; SI-NEXT:    v_readlane_b32 s71, v32, 21
+; SI-NEXT:    v_readlane_b32 s70, v32, 20
+; SI-NEXT:    v_readlane_b32 s69, v32, 19
+; SI-NEXT:    v_readlane_b32 s68, v32, 18
+; SI-NEXT:    v_readlane_b32 s67, v32, 17
+; SI-NEXT:    v_readlane_b32 s66, v32, 16
+; SI-NEXT:    v_readlane_b32 s65, v32, 15
+; SI-NEXT:    v_readlane_b32 s64, v32, 14
+; SI-NEXT:    v_readlane_b32 s55, v32, 13
+; SI-NEXT:    v_readlane_b32 s54, v32, 12
+; SI-NEXT:    v_readlane_b32 s53, v32, 11
+; SI-NEXT:    v_readlane_b32 s52, v32, 10
+; SI-NEXT:    v_readlane_b32 s51, v32, 9
+; SI-NEXT:    v_readlane_b32 s50, v32, 8
+; SI-NEXT:    v_readlane_b32 s49, v32, 7
+; SI-NEXT:    v_readlane_b32 s48, v32, 6
+; SI-NEXT:    v_readlane_b32 s39, v32, 5
+; SI-NEXT:    v_readlane_b32 s38, v32, 4
+; SI-NEXT:    v_readlane_b32 s37, v32, 3
+; SI-NEXT:    v_readlane_b32 s36, v32, 2
+; SI-NEXT:    v_readlane_b32 s35, v32, 1
+; SI-NEXT:    v_readlane_b32 s34, v32, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -11266,41 +11266,41 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v32, s30, 0
-; VI-NEXT:    v_writelane_b32 v32, s31, 1
-; VI-NEXT:    v_writelane_b32 v32, s34, 2
-; VI-NEXT:    v_writelane_b32 v32, s35, 3
-; VI-NEXT:    v_writelane_b32 v32, s36, 4
-; VI-NEXT:    v_writelane_b32 v32, s37, 5
-; VI-NEXT:    v_writelane_b32 v32, s38, 6
-; VI-NEXT:    v_writelane_b32 v32, s39, 7
-; VI-NEXT:    v_writelane_b32 v32, s48, 8
-; VI-NEXT:    v_writelane_b32 v32, s49, 9
-; VI-NEXT:    v_writelane_b32 v32, s50, 10
-; VI-NEXT:    v_writelane_b32 v32, s51, 11
-; VI-NEXT:    v_writelane_b32 v32, s52, 12
-; VI-NEXT:    v_writelane_b32 v32, s53, 13
-; VI-NEXT:    v_writelane_b32 v32, s54, 14
-; VI-NEXT:    v_writelane_b32 v32, s55, 15
-; VI-NEXT:    v_writelane_b32 v32, s64, 16
-; VI-NEXT:    v_writelane_b32 v32, s65, 17
-; VI-NEXT:    v_writelane_b32 v32, s66, 18
-; VI-NEXT:    v_writelane_b32 v32, s67, 19
-; VI-NEXT:    v_writelane_b32 v32, s68, 20
-; VI-NEXT:    v_writelane_b32 v32, s69, 21
-; VI-NEXT:    v_writelane_b32 v32, s70, 22
-; VI-NEXT:    v_writelane_b32 v32, s71, 23
-; VI-NEXT:    v_writelane_b32 v32, s80, 24
-; VI-NEXT:    v_writelane_b32 v32, s81, 25
-; VI-NEXT:    v_writelane_b32 v32, s82, 26
-; VI-NEXT:    v_writelane_b32 v32, s83, 27
-; VI-NEXT:    v_writelane_b32 v32, s84, 28
+; VI-NEXT:    v_writelane_b32 v32, s34, 0
+; VI-NEXT:    v_writelane_b32 v32, s35, 1
+; VI-NEXT:    v_writelane_b32 v32, s36, 2
+; VI-NEXT:    v_writelane_b32 v32, s37, 3
+; VI-NEXT:    v_writelane_b32 v32, s38, 4
+; VI-NEXT:    v_writelane_b32 v32, s39, 5
+; VI-NEXT:    v_writelane_b32 v32, s48, 6
+; VI-NEXT:    v_writelane_b32 v32, s49, 7
+; VI-NEXT:    v_writelane_b32 v32, s50, 8
+; VI-NEXT:    v_writelane_b32 v32, s51, 9
+; VI-NEXT:    v_writelane_b32 v32, s52, 10
+; VI-NEXT:    v_writelane_b32 v32, s53, 11
+; VI-NEXT:    v_writelane_b32 v32, s54, 12
+; VI-NEXT:    v_writelane_b32 v32, s55, 13
+; VI-NEXT:    v_writelane_b32 v32, s64, 14
+; VI-NEXT:    v_writelane_b32 v32, s65, 15
+; VI-NEXT:    v_writelane_b32 v32, s66, 16
+; VI-NEXT:    v_writelane_b32 v32, s67, 17
+; VI-NEXT:    v_writelane_b32 v32, s68, 18
+; VI-NEXT:    v_writelane_b32 v32, s69, 19
+; VI-NEXT:    v_writelane_b32 v32, s70, 20
+; VI-NEXT:    v_writelane_b32 v32, s71, 21
+; VI-NEXT:    v_writelane_b32 v32, s80, 22
+; VI-NEXT:    v_writelane_b32 v32, s81, 23
+; VI-NEXT:    v_writelane_b32 v32, s82, 24
+; VI-NEXT:    v_writelane_b32 v32, s83, 25
+; VI-NEXT:    v_writelane_b32 v32, s84, 26
+; VI-NEXT:    v_writelane_b32 v32, s85, 27
+; VI-NEXT:    v_writelane_b32 v32, s86, 28
 ; VI-NEXT:    v_readfirstlane_b32 s8, v12
-; VI-NEXT:    v_writelane_b32 v32, s85, 29
+; VI-NEXT:    v_writelane_b32 v32, s87, 29
 ; VI-NEXT:    s_lshr_b32 s15, s8, 16
 ; VI-NEXT:    v_readfirstlane_b32 s10, v11
 ; VI-NEXT:    ; implicit-def: $vgpr33 : SGPR spill to VGPR lane
-; VI-NEXT:    v_writelane_b32 v32, s86, 30
+; VI-NEXT:    v_writelane_b32 v32, s30, 30
 ; VI-NEXT:    v_readfirstlane_b32 s6, v13
 ; VI-NEXT:    s_lshr_b32 s61, s10, 16
 ; VI-NEXT:    v_readfirstlane_b32 s12, v10
@@ -11315,7 +11315,7 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i
 ; VI-NEXT:    v_readfirstlane_b32 s80, v1
 ; VI-NEXT:    v_readfirstlane_b32 s83, v0
 ; VI-NEXT:    v_writelane_b32 v33, s15, 0
-; VI-NEXT:    v_writelane_b32 v32, s87, 31
+; VI-NEXT:    v_writelane_b32 v32, s31, 31
 ; VI-NEXT:    s_lshr_b32 s56, s29, 16
 ; VI-NEXT:    s_lshr_b32 s75, s28, 16
 ; VI-NEXT:    s_lshr_b32 s90, s27, 16
@@ -11602,38 +11602,38 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i
 ; VI-NEXT:    v_mov_b32_e32 v30, s66
 ; VI-NEXT:    v_mov_b32_e32 v31, s67
 ; VI-NEXT:  .LBB19_5: ; %end
-; VI-NEXT:    v_readlane_b32 s87, v32, 31
-; VI-NEXT:    v_readlane_b32 s86, v32, 30
-; VI-NEXT:    v_readlane_b32 s85, v32, 29
-; VI-NEXT:    v_readlane_b32 s84, v32, 28
-; VI-NEXT:    v_readlane_b32 s83, v32, 27
-; VI-NEXT:    v_readlane_b32 s82, v32, 26
-; VI-NEXT:    v_readlane_b32 s81, v32, 25
-; VI-NEXT:    v_readlane_b32 s80, v32, 24
-; VI-NEXT:    v_readlane_b32 s71, v32, 23
-; VI-NEXT:    v_readlane_b32 s70, v32, 22
-; VI-NEXT:    v_readlane_b32 s69, v32, 21
-; VI-NEXT:    v_readlane_b32 s68, v32, 20
-; VI-NEXT:    v_readlane_b32 s67, v32, 19
-; VI-NEXT:    v_readlane_b32 s66, v32, 18
-; VI-NEXT:    v_readlane_b32 s65, v32, 17
-; VI-NEXT:    v_readlane_b32 s64, v32, 16
-; VI-NEXT:    v_readlane_b32 s55, v32, 15
-; VI-NEXT:    v_readlane_b32 s54, v32, 14
-; VI-NEXT:    v_readlane_b32 s53, v32, 13
-; VI-NEXT:    v_readlane_b32 s52, v32, 12
-; VI-NEXT:    v_readlane_b32 s51, v32, 11
-; VI-NEXT:    v_readlane_b32 s50, v32, 10
-; VI-NEXT:    v_readlane_b32 s49, v32, 9
-; VI-NEXT:    v_readlane_b32 s48, v32, 8
-; VI-NEXT:    v_readlane_b32 s39, v32, 7
-; VI-NEXT:    v_readlane_b32 s38, v32, 6
-; VI-NEXT:    v_readlane_b32 s37, v32, 5
-; VI-NEXT:    v_readlane_b32 s36, v32, 4
-; VI-NEXT:    v_readlane_b32 s35, v32, 3
-; VI-NEXT:    v_readlane_b32 s34, v32, 2
-; VI-NEXT:    v_readlane_b32 s31, v32, 1
-; VI-NEXT:    v_readlane_b32 s30, v32, 0
+; VI-NEXT:    v_readlane_b32 s30, v32, 30
+; VI-NEXT:    v_readlane_b32 s31, v32, 31
+; VI-NEXT:    v_readlane_b32 s87, v32, 29
+; VI-NEXT:    v_readlane_b32 s86, v32, 28
+; VI-NEXT:    v_readlane_b32 s85, v32, 27
+; VI-NEXT:    v_readlane_b32 s84, v32, 26
+; VI-NEXT:    v_readlane_b32 s83, v32, 25
+; VI-NEXT:    v_readlane_b32 s82, v32, 24
+; VI-NEXT:    v_readlane_b32 s81, v32, 23
+; VI-NEXT:    v_readlane_b32 s80, v32, 22
+; VI-NEXT:    v_readlane_b32 s71, v32, 21
+; VI-NEXT:    v_readlane_b32 s70, v32, 20
+; VI-NEXT:    v_readlane_b32 s69, v32, 19
+; VI-NEXT:    v_readlane_b32 s68, v32, 18
+; VI-NEXT:    v_readlane_b32 s67, v32, 17
+; VI-NEXT:    v_readlane_b32 s66, v32, 16
+; VI-NEXT:    v_readlane_b32 s65, v32, 15
+; VI-NEXT:    v_readlane_b32 s64, v32, 14
+; VI-NEXT:    v_readlane_b32 s55, v32, 13
+; VI-NEXT:    v_readlane_b32 s54, v32, 12
+; VI-NEXT:    v_readlane_b32 s53, v32, 11
+; VI-NEXT:    v_readlane_b32 s52, v32, 10
+; VI-NEXT:    v_readlane_b32 s51, v32, 9
+; VI-NEXT:    v_readlane_b32 s50, v32, 8
+; VI-NEXT:    v_readlane_b32 s49, v32, 7
+; VI-NEXT:    v_readlane_b32 s48, v32, 6
+; VI-NEXT:    v_readlane_b32 s39, v32, 5
+; VI-NEXT:    v_readlane_b32 s38, v32, 4
+; VI-NEXT:    v_readlane_b32 s37, v32, 3
+; VI-NEXT:    v_readlane_b32 s36, v32, 2
+; VI-NEXT:    v_readlane_b32 s35, v32, 1
+; VI-NEXT:    v_readlane_b32 s34, v32, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -11962,7 +11962,7 @@ end:
   ret <28 x i32> %phi
 }
 
-define <14 x i64> @bitcast_v28f32_to_v14i64(<28 x float> %a, i32 %b) {
+define <14 x i64> @bitcast_v28f32_to_v14i64(<28 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v28f32_to_v14i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12130,7 +12130,7 @@ end:
   ret <14 x i64> %phi
 }
 
-define inreg <14 x i64> @bitcast_v28f32_to_v14i64_scalar(<28 x float> inreg %a, i32 inreg %b) {
+define inreg <14 x i64> @bitcast_v28f32_to_v14i64_scalar(<28 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v28f32_to_v14i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12674,7 +12674,7 @@ end:
   ret <14 x i64> %phi
 }
 
-define <28 x float> @bitcast_v14i64_to_v28f32(<14 x i64> %a, i32 %b) {
+define <28 x float> @bitcast_v14i64_to_v28f32(<14 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v14i64_to_v28f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12863,7 +12863,7 @@ end:
   ret <28 x float> %phi
 }
 
-define inreg <28 x float> @bitcast_v14i64_to_v28f32_scalar(<14 x i64> inreg %a, i32 inreg %b) {
+define inreg <28 x float> @bitcast_v14i64_to_v28f32_scalar(<14 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v14i64_to_v28f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13201,7 +13201,7 @@ end:
   ret <28 x float> %phi
 }
 
-define <14 x double> @bitcast_v28f32_to_v14f64(<28 x float> %a, i32 %b) {
+define <14 x double> @bitcast_v28f32_to_v14f64(<28 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v28f32_to_v14f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13369,7 +13369,7 @@ end:
   ret <14 x double> %phi
 }
 
-define inreg <14 x double> @bitcast_v28f32_to_v14f64_scalar(<28 x float> inreg %a, i32 inreg %b) {
+define inreg <14 x double> @bitcast_v28f32_to_v14f64_scalar(<28 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v28f32_to_v14f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13913,7 +13913,7 @@ end:
   ret <14 x double> %phi
 }
 
-define <28 x float> @bitcast_v14f64_to_v28f32(<14 x double> %a, i32 %b) {
+define <28 x float> @bitcast_v14f64_to_v28f32(<14 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v14f64_to_v28f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14039,7 +14039,7 @@ end:
   ret <28 x float> %phi
 }
 
-define inreg <28 x float> @bitcast_v14f64_to_v28f32_scalar(<14 x double> inreg %a, i32 inreg %b) {
+define inreg <28 x float> @bitcast_v14f64_to_v28f32_scalar(<14 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v14f64_to_v28f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14527,7 +14527,7 @@ end:
   ret <28 x float> %phi
 }
 
-define <56 x i16> @bitcast_v28f32_to_v56i16(<28 x float> %a, i32 %b) {
+define <56 x i16> @bitcast_v28f32_to_v56i16(<28 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v28f32_to_v56i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15335,7 +15335,7 @@ end:
   ret <56 x i16> %phi
 }
 
-define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a, i32 inreg %b) {
+define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v28f32_to_v56i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15343,20 +15343,20 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a,
 ; SI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v58, s30, 0
-; SI-NEXT:    v_writelane_b32 v58, s31, 1
-; SI-NEXT:    v_writelane_b32 v58, s34, 2
-; SI-NEXT:    v_writelane_b32 v58, s35, 3
-; SI-NEXT:    v_writelane_b32 v58, s36, 4
-; SI-NEXT:    v_writelane_b32 v58, s37, 5
-; SI-NEXT:    v_writelane_b32 v58, s38, 6
-; SI-NEXT:    v_writelane_b32 v58, s39, 7
-; SI-NEXT:    v_writelane_b32 v58, s48, 8
-; SI-NEXT:    v_writelane_b32 v58, s49, 9
-; SI-NEXT:    v_writelane_b32 v58, s50, 10
-; SI-NEXT:    v_writelane_b32 v58, s51, 11
+; SI-NEXT:    v_writelane_b32 v58, s34, 0
+; SI-NEXT:    v_writelane_b32 v58, s35, 1
+; SI-NEXT:    v_writelane_b32 v58, s36, 2
+; SI-NEXT:    v_writelane_b32 v58, s37, 3
+; SI-NEXT:    v_writelane_b32 v58, s38, 4
+; SI-NEXT:    v_writelane_b32 v58, s39, 5
+; SI-NEXT:    v_writelane_b32 v58, s48, 6
+; SI-NEXT:    v_writelane_b32 v58, s49, 7
+; SI-NEXT:    v_writelane_b32 v58, s50, 8
+; SI-NEXT:    v_writelane_b32 v58, s51, 9
+; SI-NEXT:    v_writelane_b32 v58, s52, 10
+; SI-NEXT:    v_writelane_b32 v58, s53, 11
 ; SI-NEXT:    v_readfirstlane_b32 s40, v14
-; SI-NEXT:    v_writelane_b32 v58, s52, 12
+; SI-NEXT:    v_writelane_b32 v58, s30, 12
 ; SI-NEXT:    v_readfirstlane_b32 s5, v13
 ; SI-NEXT:    v_readfirstlane_b32 s4, v12
 ; SI-NEXT:    v_readfirstlane_b32 s7, v11
@@ -15382,7 +15382,7 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a,
 ; SI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 ; 4-byte Folded Spill
-; SI-NEXT:    v_writelane_b32 v58, s53, 13
+; SI-NEXT:    v_writelane_b32 v58, s31, 13
 ; SI-NEXT:    s_cbranch_scc0 .LBB29_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s53, s5, 16
@@ -15650,6 +15650,7 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a,
 ; SI-NEXT:    v_or_b32_e32 v26, v26, v28
 ; SI-NEXT:    v_and_b32_e32 v27, 0xffff, v27
 ; SI-NEXT:    v_lshlrev_b32_e32 v28, 16, v52
+; SI-NEXT:    v_readlane_b32 s30, v58, 12
 ; SI-NEXT:    v_or_b32_e32 v5, v5, v39
 ; SI-NEXT:    v_or_b32_e32 v7, v7, v38
 ; SI-NEXT:    v_or_b32_e32 v9, v9, v37
@@ -15662,20 +15663,19 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a,
 ; SI-NEXT:    v_or_b32_e32 v23, v23, v30
 ; SI-NEXT:    v_or_b32_e32 v25, v25, v29
 ; SI-NEXT:    v_or_b32_e32 v27, v27, v28
-; SI-NEXT:    v_readlane_b32 s53, v58, 13
-; SI-NEXT:    v_readlane_b32 s52, v58, 12
-; SI-NEXT:    v_readlane_b32 s51, v58, 11
-; SI-NEXT:    v_readlane_b32 s50, v58, 10
-; SI-NEXT:    v_readlane_b32 s49, v58, 9
-; SI-NEXT:    v_readlane_b32 s48, v58, 8
-; SI-NEXT:    v_readlane_b32 s39, v58, 7
-; SI-NEXT:    v_readlane_b32 s38, v58, 6
-; SI-NEXT:    v_readlane_b32 s37, v58, 5
-; SI-NEXT:    v_readlane_b32 s36, v58, 4
-; SI-NEXT:    v_readlane_b32 s35, v58, 3
-; SI-NEXT:    v_readlane_b32 s34, v58, 2
-; SI-NEXT:    v_readlane_b32 s31, v58, 1
-; SI-NEXT:    v_readlane_b32 s30, v58, 0
+; SI-NEXT:    v_readlane_b32 s31, v58, 13
+; SI-NEXT:    v_readlane_b32 s53, v58, 11
+; SI-NEXT:    v_readlane_b32 s52, v58, 10
+; SI-NEXT:    v_readlane_b32 s51, v58, 9
+; SI-NEXT:    v_readlane_b32 s50, v58, 8
+; SI-NEXT:    v_readlane_b32 s49, v58, 7
+; SI-NEXT:    v_readlane_b32 s48, v58, 6
+; SI-NEXT:    v_readlane_b32 s39, v58, 5
+; SI-NEXT:    v_readlane_b32 s38, v58, 4
+; SI-NEXT:    v_readlane_b32 s37, v58, 3
+; SI-NEXT:    v_readlane_b32 s36, v58, 2
+; SI-NEXT:    v_readlane_b32 s35, v58, 1
+; SI-NEXT:    v_readlane_b32 s34, v58, 0
 ; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -15688,10 +15688,10 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a,
 ; VI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v56, s30, 0
-; VI-NEXT:    v_writelane_b32 v56, s31, 1
+; VI-NEXT:    v_writelane_b32 v56, s34, 0
+; VI-NEXT:    v_writelane_b32 v56, s35, 1
 ; VI-NEXT:    v_readfirstlane_b32 s4, v14
-; VI-NEXT:    v_writelane_b32 v56, s34, 2
+; VI-NEXT:    v_writelane_b32 v56, s30, 2
 ; VI-NEXT:    v_readfirstlane_b32 s6, v13
 ; VI-NEXT:    v_readfirstlane_b32 s7, v12
 ; VI-NEXT:    v_readfirstlane_b32 s8, v11
@@ -15715,7 +15715,7 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a,
 ; VI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill
-; VI-NEXT:    v_writelane_b32 v56, s35, 3
+; VI-NEXT:    v_writelane_b32 v56, s31, 3
 ; VI-NEXT:    s_cbranch_scc0 .LBB29_3
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    s_lshr_b32 s44, s6, 16
@@ -15937,6 +15937,7 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a,
 ; VI-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
 ; VI-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
 ; VI-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
+; VI-NEXT:    v_readlane_b32 s30, v56, 2
 ; VI-NEXT:    v_or_b32_sdwa v8, v8, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v9, v9, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v10, v10, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -15957,10 +15958,9 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a,
 ; VI-NEXT:    v_or_b32_sdwa v25, v25, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v26, v26, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_readlane_b32 s35, v56, 3
-; VI-NEXT:    v_readlane_b32 s34, v56, 2
-; VI-NEXT:    v_readlane_b32 s31, v56, 1
-; VI-NEXT:    v_readlane_b32 s30, v56, 0
+; VI-NEXT:    v_readlane_b32 s31, v56, 3
+; VI-NEXT:    v_readlane_b32 s35, v56, 1
+; VI-NEXT:    v_readlane_b32 s34, v56, 0
 ; VI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -16675,7 +16675,7 @@ end:
   ret <56 x i16> %phi
 }
 
-define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) {
+define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v56i16_to_v28f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17920,7 +17920,7 @@ end:
   ret <28 x float> %phi
 }
 
-define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a, i32 inreg %b) {
+define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v56i16_to_v28f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17928,37 +17928,38 @@ define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a,
 ; SI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v28, s30, 0
-; SI-NEXT:    v_writelane_b32 v28, s31, 1
-; SI-NEXT:    v_writelane_b32 v28, s34, 2
-; SI-NEXT:    v_writelane_b32 v28, s35, 3
-; SI-NEXT:    v_writelane_b32 v28, s36, 4
-; SI-NEXT:    v_writelane_b32 v28, s37, 5
-; SI-NEXT:    v_writelane_b32 v28, s38, 6
-; SI-NEXT:    v_writelane_b32 v28, s39, 7
-; SI-NEXT:    v_writelane_b32 v28, s48, 8
-; SI-NEXT:    v_writelane_b32 v28, s49, 9
-; SI-NEXT:    v_writelane_b32 v28, s50, 10
-; SI-NEXT:    v_writelane_b32 v28, s51, 11
-; SI-NEXT:    v_writelane_b32 v28, s52, 12
-; SI-NEXT:    v_writelane_b32 v28, s53, 13
-; SI-NEXT:    v_writelane_b32 v28, s54, 14
-; SI-NEXT:    v_writelane_b32 v28, s55, 15
-; SI-NEXT:    v_writelane_b32 v28, s64, 16
-; SI-NEXT:    v_writelane_b32 v28, s65, 17
-; SI-NEXT:    v_writelane_b32 v28, s66, 18
-; SI-NEXT:    v_writelane_b32 v28, s67, 19
-; SI-NEXT:    v_writelane_b32 v28, s68, 20
-; SI-NEXT:    v_writelane_b32 v28, s69, 21
-; SI-NEXT:    v_writelane_b32 v28, s70, 22
-; SI-NEXT:    v_writelane_b32 v28, s71, 23
-; SI-NEXT:    v_writelane_b32 v28, s80, 24
-; SI-NEXT:    v_writelane_b32 v28, s81, 25
-; SI-NEXT:    v_writelane_b32 v28, s82, 26
-; SI-NEXT:    v_writelane_b32 v28, s83, 27
-; SI-NEXT:    v_writelane_b32 v28, s84, 28
-; SI-NEXT:    v_writelane_b32 v28, s85, 29
-; SI-NEXT:    v_writelane_b32 v28, s86, 30
+; SI-NEXT:    v_writelane_b32 v28, s34, 0
+; SI-NEXT:    v_writelane_b32 v28, s35, 1
+; SI-NEXT:    v_writelane_b32 v28, s36, 2
+; SI-NEXT:    v_writelane_b32 v28, s37, 3
+; SI-NEXT:    v_writelane_b32 v28, s38, 4
+; SI-NEXT:    v_writelane_b32 v28, s39, 5
+; SI-NEXT:    v_writelane_b32 v28, s48, 6
+; SI-NEXT:    v_writelane_b32 v28, s49, 7
+; SI-NEXT:    v_writelane_b32 v28, s50, 8
+; SI-NEXT:    v_writelane_b32 v28, s51, 9
+; SI-NEXT:    v_writelane_b32 v28, s52, 10
+; SI-NEXT:    v_writelane_b32 v28, s53, 11
+; SI-NEXT:    v_writelane_b32 v28, s54, 12
+; SI-NEXT:    v_writelane_b32 v28, s55, 13
+; SI-NEXT:    v_writelane_b32 v28, s64, 14
+; SI-NEXT:    v_writelane_b32 v28, s65, 15
+; SI-NEXT:    v_writelane_b32 v28, s66, 16
+; SI-NEXT:    v_writelane_b32 v28, s67, 17
+; SI-NEXT:    v_writelane_b32 v28, s68, 18
+; SI-NEXT:    v_writelane_b32 v28, s69, 19
+; SI-NEXT:    v_writelane_b32 v28, s70, 20
+; SI-NEXT:    v_writelane_b32 v28, s71, 21
+; SI-NEXT:    v_writelane_b32 v28, s80, 22
+; SI-NEXT:    v_writelane_b32 v28, s81, 23
+; SI-NEXT:    v_writelane_b32 v28, s82, 24
+; SI-NEXT:    v_writelane_b32 v28, s83, 25
+; SI-NEXT:    v_writelane_b32 v28, s84, 26
+; SI-NEXT:    v_writelane_b32 v28, s85, 27
+; SI-NEXT:    v_writelane_b32 v28, s86, 28
+; SI-NEXT:    v_writelane_b32 v28, s87, 29
+; SI-NEXT:    v_writelane_b32 v28, s30, 30
+; SI-NEXT:    v_writelane_b32 v28, s31, 31
 ; SI-NEXT:    v_readfirstlane_b32 s7, v13
 ; SI-NEXT:    v_readfirstlane_b32 s9, v12
 ; SI-NEXT:    v_readfirstlane_b32 s11, v11
@@ -17973,7 +17974,6 @@ define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a,
 ; SI-NEXT:    v_readfirstlane_b32 s31, v2
 ; SI-NEXT:    v_readfirstlane_b32 s69, v1
 ; SI-NEXT:    v_readfirstlane_b32 s80, v0
-; SI-NEXT:    v_writelane_b32 v28, s87, 31
 ; SI-NEXT:    s_lshr_b32 s78, s29, 16
 ; SI-NEXT:    s_lshr_b32 s89, s28, 16
 ; SI-NEXT:    s_lshr_b32 s92, s27, 16
@@ -18233,6 +18233,7 @@ define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a,
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_add_i32 s63, s4, 0x30000
 ; SI-NEXT:  .LBB31_3: ; %end
+; SI-NEXT:    v_readlane_b32 s30, v28, 30
 ; SI-NEXT:    v_mov_b32_e32 v0, s36
 ; SI-NEXT:    v_mov_b32_e32 v1, s37
 ; SI-NEXT:    v_mov_b32_e32 v2, s38
@@ -18261,38 +18262,37 @@ define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a,
 ; SI-NEXT:    v_mov_b32_e32 v25, s61
 ; SI-NEXT:    v_mov_b32_e32 v26, s62
 ; SI-NEXT:    v_mov_b32_e32 v27, s63
-; SI-NEXT:    v_readlane_b32 s87, v28, 31
-; SI-NEXT:    v_readlane_b32 s86, v28, 30
-; SI-NEXT:    v_readlane_b32 s85, v28, 29
-; SI-NEXT:    v_readlane_b32 s84, v28, 28
-; SI-NEXT:    v_readlane_b32 s83, v28, 27
-; SI-NEXT:    v_readlane_b32 s82, v28, 26
-; SI-NEXT:    v_readlane_b32 s81, v28, 25
-; SI-NEXT:    v_readlane_b32 s80, v28, 24
-; SI-NEXT:    v_readlane_b32 s71, v28, 23
-; SI-NEXT:    v_readlane_b32 s70, v28, 22
-; SI-NEXT:    v_readlane_b32 s69, v28, 21
-; SI-NEXT:    v_readlane_b32 s68, v28, 20
-; SI-NEXT:    v_readlane_b32 s67, v28, 19
-; SI-NEXT:    v_readlane_b32 s66, v28, 18
-; SI-NEXT:    v_readlane_b32 s65, v28, 17
-; SI-NEXT:    v_readlane_b32 s64, v28, 16
-; SI-NEXT:    v_readlane_b32 s55, v28, 15
-; SI-NEXT:    v_readlane_b32 s54, v28, 14
-; SI-NEXT:    v_readlane_b32 s53, v28, 13
-; SI-NEXT:    v_readlane_b32 s52, v28, 12
-; SI-NEXT:    v_readlane_b32 s51, v28, 11
-; SI-NEXT:    v_readlane_b32 s50, v28, 10
-; SI-NEXT:    v_readlane_b32 s49, v28, 9
-; SI-NEXT:    v_readlane_b32 s48, v28, 8
-; SI-NEXT:    v_readlane_b32 s39, v28, 7
-; SI-NEXT:    v_readlane_b32 s38, v28, 6
-; SI-NEXT:    v_readlane_b32 s37, v28, 5
-; SI-NEXT:    v_readlane_b32 s36, v28, 4
-; SI-NEXT:    v_readlane_b32 s35, v28, 3
-; SI-NEXT:    v_readlane_b32 s34, v28, 2
-; SI-NEXT:    v_readlane_b32 s31, v28, 1
-; SI-NEXT:    v_readlane_b32 s30, v28, 0
+; SI-NEXT:    v_readlane_b32 s31, v28, 31
+; SI-NEXT:    v_readlane_b32 s87, v28, 29
+; SI-NEXT:    v_readlane_b32 s86, v28, 28
+; SI-NEXT:    v_readlane_b32 s85, v28, 27
+; SI-NEXT:    v_readlane_b32 s84, v28, 26
+; SI-NEXT:    v_readlane_b32 s83, v28, 25
+; SI-NEXT:    v_readlane_b32 s82, v28, 24
+; SI-NEXT:    v_readlane_b32 s81, v28, 23
+; SI-NEXT:    v_readlane_b32 s80, v28, 22
+; SI-NEXT:    v_readlane_b32 s71, v28, 21
+; SI-NEXT:    v_readlane_b32 s70, v28, 20
+; SI-NEXT:    v_readlane_b32 s69, v28, 19
+; SI-NEXT:    v_readlane_b32 s68, v28, 18
+; SI-NEXT:    v_readlane_b32 s67, v28, 17
+; SI-NEXT:    v_readlane_b32 s66, v28, 16
+; SI-NEXT:    v_readlane_b32 s65, v28, 15
+; SI-NEXT:    v_readlane_b32 s64, v28, 14
+; SI-NEXT:    v_readlane_b32 s55, v28, 13
+; SI-NEXT:    v_readlane_b32 s54, v28, 12
+; SI-NEXT:    v_readlane_b32 s53, v28, 11
+; SI-NEXT:    v_readlane_b32 s52, v28, 10
+; SI-NEXT:    v_readlane_b32 s51, v28, 9
+; SI-NEXT:    v_readlane_b32 s50, v28, 8
+; SI-NEXT:    v_readlane_b32 s49, v28, 7
+; SI-NEXT:    v_readlane_b32 s48, v28, 6
+; SI-NEXT:    v_readlane_b32 s39, v28, 5
+; SI-NEXT:    v_readlane_b32 s38, v28, 4
+; SI-NEXT:    v_readlane_b32 s37, v28, 3
+; SI-NEXT:    v_readlane_b32 s36, v28, 2
+; SI-NEXT:    v_readlane_b32 s35, v28, 1
+; SI-NEXT:    v_readlane_b32 s34, v28, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -18309,38 +18309,37 @@ define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a,
 ; VI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v28, s30, 0
-; VI-NEXT:    v_writelane_b32 v28, s31, 1
-; VI-NEXT:    v_writelane_b32 v28, s34, 2
-; VI-NEXT:    v_writelane_b32 v28, s35, 3
-; VI-NEXT:    v_writelane_b32 v28, s36, 4
-; VI-NEXT:    v_writelane_b32 v28, s37, 5
-; VI-NEXT:    v_writelane_b32 v28, s38, 6
-; VI-NEXT:    v_writelane_b32 v28, s39, 7
-; VI-NEXT:    v_writelane_b32 v28, s48, 8
-; VI-NEXT:    v_writelane_b32 v28, s49, 9
-; VI-NEXT:    v_writelane_b32 v28, s50, 10
-; VI-NEXT:    v_writelane_b32 v28, s51, 11
-; VI-NEXT:    v_writelane_b32 v28, s52, 12
-; VI-NEXT:    v_writelane_b32 v28, s53, 13
-; VI-NEXT:    v_writelane_b32 v28, s54, 14
-; VI-NEXT:    v_writelane_b32 v28, s55, 15
-; VI-NEXT:    v_writelane_b32 v28, s64, 16
-; VI-NEXT:    v_writelane_b32 v28, s65, 17
-; VI-NEXT:    v_writelane_b32 v28, s66, 18
-; VI-NEXT:    v_writelane_b32 v28, s67, 19
-; VI-NEXT:    v_writelane_b32 v28, s68, 20
-; VI-NEXT:    v_writelane_b32 v28, s69, 21
-; VI-NEXT:    v_writelane_b32 v28, s70, 22
-; VI-NEXT:    v_writelane_b32 v28, s71, 23
-; VI-NEXT:    v_writelane_b32 v28, s80, 24
-; VI-NEXT:    v_writelane_b32 v28, s81, 25
-; VI-NEXT:    v_writelane_b32 v28, s82, 26
-; VI-NEXT:    v_writelane_b32 v28, s83, 27
-; VI-NEXT:    v_writelane_b32 v28, s84, 28
-; VI-NEXT:    v_writelane_b32 v28, s85, 29
-; VI-NEXT:    v_writelane_b32 v28, s86, 30
-; VI-NEXT:    v_writelane_b32 v28, s87, 31
+; VI-NEXT:    v_writelane_b32 v28, s34, 0
+; VI-NEXT:    v_writelane_b32 v28, s35, 1
+; VI-NEXT:    v_writelane_b32 v28, s36, 2
+; VI-NEXT:    v_writelane_b32 v28, s37, 3
+; VI-NEXT:    v_writelane_b32 v28, s38, 4
+; VI-NEXT:    v_writelane_b32 v28, s39, 5
+; VI-NEXT:    v_writelane_b32 v28, s48, 6
+; VI-NEXT:    v_writelane_b32 v28, s49, 7
+; VI-NEXT:    v_writelane_b32 v28, s50, 8
+; VI-NEXT:    v_writelane_b32 v28, s51, 9
+; VI-NEXT:    v_writelane_b32 v28, s52, 10
+; VI-NEXT:    v_writelane_b32 v28, s53, 11
+; VI-NEXT:    v_writelane_b32 v28, s54, 12
+; VI-NEXT:    v_writelane_b32 v28, s55, 13
+; VI-NEXT:    v_writelane_b32 v28, s64, 14
+; VI-NEXT:    v_writelane_b32 v28, s65, 15
+; VI-NEXT:    v_writelane_b32 v28, s66, 16
+; VI-NEXT:    v_writelane_b32 v28, s67, 17
+; VI-NEXT:    v_writelane_b32 v28, s68, 18
+; VI-NEXT:    v_writelane_b32 v28, s69, 19
+; VI-NEXT:    v_writelane_b32 v28, s70, 20
+; VI-NEXT:    v_writelane_b32 v28, s71, 21
+; VI-NEXT:    v_writelane_b32 v28, s80, 22
+; VI-NEXT:    v_writelane_b32 v28, s81, 23
+; VI-NEXT:    v_writelane_b32 v28, s82, 24
+; VI-NEXT:    v_writelane_b32 v28, s83, 25
+; VI-NEXT:    v_writelane_b32 v28, s84, 26
+; VI-NEXT:    v_writelane_b32 v28, s85, 27
+; VI-NEXT:    v_writelane_b32 v28, s86, 28
+; VI-NEXT:    v_writelane_b32 v28, s87, 29
+; VI-NEXT:    v_writelane_b32 v28, s30, 30
 ; VI-NEXT:    v_readfirstlane_b32 s86, v13
 ; VI-NEXT:    v_readfirstlane_b32 s6, v12
 ; VI-NEXT:    v_readfirstlane_b32 s9, v11
@@ -18355,6 +18354,7 @@ define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a,
 ; VI-NEXT:    v_readfirstlane_b32 s69, v2
 ; VI-NEXT:    v_readfirstlane_b32 s81, v1
 ; VI-NEXT:    v_readfirstlane_b32 s84, v0
+; VI-NEXT:    v_writelane_b32 v28, s31, 31
 ; VI-NEXT:    s_lshr_b32 s79, s29, 16
 ; VI-NEXT:    s_lshr_b32 s90, s28, 16
 ; VI-NEXT:    s_lshr_b32 s31, s27, 16
@@ -18614,6 +18614,7 @@ define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a,
 ; VI-NEXT:    s_or_b32 s4, s5, s4
 ; VI-NEXT:    s_add_i32 s63, s4, 0x30000
 ; VI-NEXT:  .LBB31_3: ; %end
+; VI-NEXT:    v_readlane_b32 s30, v28, 30
 ; VI-NEXT:    v_mov_b32_e32 v0, s36
 ; VI-NEXT:    v_mov_b32_e32 v1, s37
 ; VI-NEXT:    v_mov_b32_e32 v2, s38
@@ -18642,38 +18643,37 @@ define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a,
 ; VI-NEXT:    v_mov_b32_e32 v25, s61
 ; VI-NEXT:    v_mov_b32_e32 v26, s62
 ; VI-NEXT:    v_mov_b32_e32 v27, s63
-; VI-NEXT:    v_readlane_b32 s87, v28, 31
-; VI-NEXT:    v_readlane_b32 s86, v28, 30
-; VI-NEXT:    v_readlane_b32 s85, v28, 29
-; VI-NEXT:    v_readlane_b32 s84, v28, 28
-; VI-NEXT:    v_readlane_b32 s83, v28, 27
-; VI-NEXT:    v_readlane_b32 s82, v28, 26
-; VI-NEXT:    v_readlane_b32 s81, v28, 25
-; VI-NEXT:    v_readlane_b32 s80, v28, 24
-; VI-NEXT:    v_readlane_b32 s71, v28, 23
-; VI-NEXT:    v_readlane_b32 s70, v28, 22
-; VI-NEXT:    v_readlane_b32 s69, v28, 21
-; VI-NEXT:    v_readlane_b32 s68, v28, 20
-; VI-NEXT:    v_readlane_b32 s67, v28, 19
-; VI-NEXT:    v_readlane_b32 s66, v28, 18
-; VI-NEXT:    v_readlane_b32 s65, v28, 17
-; VI-NEXT:    v_readlane_b32 s64, v28, 16
-; VI-NEXT:    v_readlane_b32 s55, v28, 15
-; VI-NEXT:    v_readlane_b32 s54, v28, 14
-; VI-NEXT:    v_readlane_b32 s53, v28, 13
-; VI-NEXT:    v_readlane_b32 s52, v28, 12
-; VI-NEXT:    v_readlane_b32 s51, v28, 11
-; VI-NEXT:    v_readlane_b32 s50, v28, 10
-; VI-NEXT:    v_readlane_b32 s49, v28, 9
-; VI-NEXT:    v_readlane_b32 s48, v28, 8
-; VI-NEXT:    v_readlane_b32 s39, v28, 7
-; VI-NEXT:    v_readlane_b32 s38, v28, 6
-; VI-NEXT:    v_readlane_b32 s37, v28, 5
-; VI-NEXT:    v_readlane_b32 s36, v28, 4
-; VI-NEXT:    v_readlane_b32 s35, v28, 3
-; VI-NEXT:    v_readlane_b32 s34, v28, 2
-; VI-NEXT:    v_readlane_b32 s31, v28, 1
-; VI-NEXT:    v_readlane_b32 s30, v28, 0
+; VI-NEXT:    v_readlane_b32 s31, v28, 31
+; VI-NEXT:    v_readlane_b32 s87, v28, 29
+; VI-NEXT:    v_readlane_b32 s86, v28, 28
+; VI-NEXT:    v_readlane_b32 s85, v28, 27
+; VI-NEXT:    v_readlane_b32 s84, v28, 26
+; VI-NEXT:    v_readlane_b32 s83, v28, 25
+; VI-NEXT:    v_readlane_b32 s82, v28, 24
+; VI-NEXT:    v_readlane_b32 s81, v28, 23
+; VI-NEXT:    v_readlane_b32 s80, v28, 22
+; VI-NEXT:    v_readlane_b32 s71, v28, 21
+; VI-NEXT:    v_readlane_b32 s70, v28, 20
+; VI-NEXT:    v_readlane_b32 s69, v28, 19
+; VI-NEXT:    v_readlane_b32 s68, v28, 18
+; VI-NEXT:    v_readlane_b32 s67, v28, 17
+; VI-NEXT:    v_readlane_b32 s66, v28, 16
+; VI-NEXT:    v_readlane_b32 s65, v28, 15
+; VI-NEXT:    v_readlane_b32 s64, v28, 14
+; VI-NEXT:    v_readlane_b32 s55, v28, 13
+; VI-NEXT:    v_readlane_b32 s54, v28, 12
+; VI-NEXT:    v_readlane_b32 s53, v28, 11
+; VI-NEXT:    v_readlane_b32 s52, v28, 10
+; VI-NEXT:    v_readlane_b32 s51, v28, 9
+; VI-NEXT:    v_readlane_b32 s50, v28, 8
+; VI-NEXT:    v_readlane_b32 s49, v28, 7
+; VI-NEXT:    v_readlane_b32 s48, v28, 6
+; VI-NEXT:    v_readlane_b32 s39, v28, 5
+; VI-NEXT:    v_readlane_b32 s38, v28, 4
+; VI-NEXT:    v_readlane_b32 s37, v28, 3
+; VI-NEXT:    v_readlane_b32 s36, v28, 2
+; VI-NEXT:    v_readlane_b32 s35, v28, 1
+; VI-NEXT:    v_readlane_b32 s34, v28, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -19091,7 +19091,7 @@ end:
   ret <28 x float> %phi
 }
 
-define <56 x half> @bitcast_v28f32_to_v56f16(<28 x float> %a, i32 %b) {
+define <56 x half> @bitcast_v28f32_to_v56f16(<28 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v28f32_to_v56f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19899,7 +19899,7 @@ end:
   ret <56 x half> %phi
 }
 
-define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, i32 inreg %b) {
+define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v28f32_to_v56f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19907,20 +19907,20 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a,
 ; SI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v58, s30, 0
-; SI-NEXT:    v_writelane_b32 v58, s31, 1
-; SI-NEXT:    v_writelane_b32 v58, s34, 2
-; SI-NEXT:    v_writelane_b32 v58, s35, 3
-; SI-NEXT:    v_writelane_b32 v58, s36, 4
-; SI-NEXT:    v_writelane_b32 v58, s37, 5
-; SI-NEXT:    v_writelane_b32 v58, s38, 6
-; SI-NEXT:    v_writelane_b32 v58, s39, 7
-; SI-NEXT:    v_writelane_b32 v58, s48, 8
-; SI-NEXT:    v_writelane_b32 v58, s49, 9
-; SI-NEXT:    v_writelane_b32 v58, s50, 10
-; SI-NEXT:    v_writelane_b32 v58, s51, 11
+; SI-NEXT:    v_writelane_b32 v58, s34, 0
+; SI-NEXT:    v_writelane_b32 v58, s35, 1
+; SI-NEXT:    v_writelane_b32 v58, s36, 2
+; SI-NEXT:    v_writelane_b32 v58, s37, 3
+; SI-NEXT:    v_writelane_b32 v58, s38, 4
+; SI-NEXT:    v_writelane_b32 v58, s39, 5
+; SI-NEXT:    v_writelane_b32 v58, s48, 6
+; SI-NEXT:    v_writelane_b32 v58, s49, 7
+; SI-NEXT:    v_writelane_b32 v58, s50, 8
+; SI-NEXT:    v_writelane_b32 v58, s51, 9
+; SI-NEXT:    v_writelane_b32 v58, s52, 10
+; SI-NEXT:    v_writelane_b32 v58, s53, 11
 ; SI-NEXT:    v_readfirstlane_b32 s40, v14
-; SI-NEXT:    v_writelane_b32 v58, s52, 12
+; SI-NEXT:    v_writelane_b32 v58, s30, 12
 ; SI-NEXT:    v_readfirstlane_b32 s5, v13
 ; SI-NEXT:    v_readfirstlane_b32 s4, v12
 ; SI-NEXT:    v_readfirstlane_b32 s7, v11
@@ -19946,7 +19946,7 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a,
 ; SI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 ; 4-byte Folded Spill
-; SI-NEXT:    v_writelane_b32 v58, s53, 13
+; SI-NEXT:    v_writelane_b32 v58, s31, 13
 ; SI-NEXT:    s_cbranch_scc0 .LBB33_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s53, s5, 16
@@ -20214,6 +20214,7 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a,
 ; SI-NEXT:    v_or_b32_e32 v26, v26, v28
 ; SI-NEXT:    v_and_b32_e32 v27, 0xffff, v27
 ; SI-NEXT:    v_lshlrev_b32_e32 v28, 16, v52
+; SI-NEXT:    v_readlane_b32 s30, v58, 12
 ; SI-NEXT:    v_or_b32_e32 v5, v5, v39
 ; SI-NEXT:    v_or_b32_e32 v7, v7, v38
 ; SI-NEXT:    v_or_b32_e32 v9, v9, v37
@@ -20226,20 +20227,19 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a,
 ; SI-NEXT:    v_or_b32_e32 v23, v23, v30
 ; SI-NEXT:    v_or_b32_e32 v25, v25, v29
 ; SI-NEXT:    v_or_b32_e32 v27, v27, v28
-; SI-NEXT:    v_readlane_b32 s53, v58, 13
-; SI-NEXT:    v_readlane_b32 s52, v58, 12
-; SI-NEXT:    v_readlane_b32 s51, v58, 11
-; SI-NEXT:    v_readlane_b32 s50, v58, 10
-; SI-NEXT:    v_readlane_b32 s49, v58, 9
-; SI-NEXT:    v_readlane_b32 s48, v58, 8
-; SI-NEXT:    v_readlane_b32 s39, v58, 7
-; SI-NEXT:    v_readlane_b32 s38, v58, 6
-; SI-NEXT:    v_readlane_b32 s37, v58, 5
-; SI-NEXT:    v_readlane_b32 s36, v58, 4
-; SI-NEXT:    v_readlane_b32 s35, v58, 3
-; SI-NEXT:    v_readlane_b32 s34, v58, 2
-; SI-NEXT:    v_readlane_b32 s31, v58, 1
-; SI-NEXT:    v_readlane_b32 s30, v58, 0
+; SI-NEXT:    v_readlane_b32 s31, v58, 13
+; SI-NEXT:    v_readlane_b32 s53, v58, 11
+; SI-NEXT:    v_readlane_b32 s52, v58, 10
+; SI-NEXT:    v_readlane_b32 s51, v58, 9
+; SI-NEXT:    v_readlane_b32 s50, v58, 8
+; SI-NEXT:    v_readlane_b32 s49, v58, 7
+; SI-NEXT:    v_readlane_b32 s48, v58, 6
+; SI-NEXT:    v_readlane_b32 s39, v58, 5
+; SI-NEXT:    v_readlane_b32 s38, v58, 4
+; SI-NEXT:    v_readlane_b32 s37, v58, 3
+; SI-NEXT:    v_readlane_b32 s36, v58, 2
+; SI-NEXT:    v_readlane_b32 s35, v58, 1
+; SI-NEXT:    v_readlane_b32 s34, v58, 0
 ; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -20252,10 +20252,10 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a,
 ; VI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v56, s30, 0
-; VI-NEXT:    v_writelane_b32 v56, s31, 1
+; VI-NEXT:    v_writelane_b32 v56, s34, 0
+; VI-NEXT:    v_writelane_b32 v56, s35, 1
 ; VI-NEXT:    v_readfirstlane_b32 s4, v14
-; VI-NEXT:    v_writelane_b32 v56, s34, 2
+; VI-NEXT:    v_writelane_b32 v56, s30, 2
 ; VI-NEXT:    v_readfirstlane_b32 s6, v13
 ; VI-NEXT:    v_readfirstlane_b32 s7, v12
 ; VI-NEXT:    v_readfirstlane_b32 s8, v11
@@ -20279,7 +20279,7 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a,
 ; VI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill
-; VI-NEXT:    v_writelane_b32 v56, s35, 3
+; VI-NEXT:    v_writelane_b32 v56, s31, 3
 ; VI-NEXT:    s_cbranch_scc0 .LBB33_3
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    s_lshr_b32 s44, s6, 16
@@ -20501,6 +20501,7 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a,
 ; VI-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
 ; VI-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
 ; VI-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
+; VI-NEXT:    v_readlane_b32 s30, v56, 2
 ; VI-NEXT:    v_or_b32_sdwa v8, v8, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v9, v9, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v10, v10, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -20521,10 +20522,9 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a,
 ; VI-NEXT:    v_or_b32_sdwa v25, v25, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v26, v26, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_readlane_b32 s35, v56, 3
-; VI-NEXT:    v_readlane_b32 s34, v56, 2
-; VI-NEXT:    v_readlane_b32 s31, v56, 1
-; VI-NEXT:    v_readlane_b32 s30, v56, 0
+; VI-NEXT:    v_readlane_b32 s31, v56, 3
+; VI-NEXT:    v_readlane_b32 s35, v56, 1
+; VI-NEXT:    v_readlane_b32 s34, v56, 0
 ; VI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -21239,7 +21239,7 @@ end:
   ret <56 x half> %phi
 }
 
-define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) {
+define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v56f16_to_v28f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22621,7 +22621,7 @@ end:
   ret <28 x float> %phi
 }
 
-define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, i32 inreg %b) {
+define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v56f16_to_v28f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22629,37 +22629,37 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a,
 ; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v32, s30, 0
-; SI-NEXT:    v_writelane_b32 v32, s31, 1
-; SI-NEXT:    v_writelane_b32 v32, s34, 2
-; SI-NEXT:    v_writelane_b32 v32, s35, 3
-; SI-NEXT:    v_writelane_b32 v32, s36, 4
-; SI-NEXT:    v_writelane_b32 v32, s37, 5
-; SI-NEXT:    v_writelane_b32 v32, s38, 6
-; SI-NEXT:    v_writelane_b32 v32, s39, 7
-; SI-NEXT:    v_writelane_b32 v32, s48, 8
-; SI-NEXT:    v_writelane_b32 v32, s49, 9
-; SI-NEXT:    v_writelane_b32 v32, s50, 10
-; SI-NEXT:    v_writelane_b32 v32, s51, 11
-; SI-NEXT:    v_writelane_b32 v32, s52, 12
-; SI-NEXT:    v_writelane_b32 v32, s53, 13
-; SI-NEXT:    v_writelane_b32 v32, s54, 14
-; SI-NEXT:    v_writelane_b32 v32, s55, 15
-; SI-NEXT:    v_writelane_b32 v32, s64, 16
-; SI-NEXT:    v_writelane_b32 v32, s65, 17
-; SI-NEXT:    v_writelane_b32 v32, s66, 18
-; SI-NEXT:    v_writelane_b32 v32, s67, 19
-; SI-NEXT:    v_writelane_b32 v32, s68, 20
-; SI-NEXT:    v_writelane_b32 v32, s69, 21
-; SI-NEXT:    v_writelane_b32 v32, s70, 22
-; SI-NEXT:    v_writelane_b32 v32, s71, 23
-; SI-NEXT:    v_writelane_b32 v32, s80, 24
-; SI-NEXT:    v_writelane_b32 v32, s81, 25
-; SI-NEXT:    v_writelane_b32 v32, s82, 26
-; SI-NEXT:    v_writelane_b32 v32, s83, 27
-; SI-NEXT:    v_writelane_b32 v32, s84, 28
-; SI-NEXT:    v_writelane_b32 v32, s85, 29
-; SI-NEXT:    v_writelane_b32 v32, s86, 30
+; SI-NEXT:    v_writelane_b32 v32, s34, 0
+; SI-NEXT:    v_writelane_b32 v32, s35, 1
+; SI-NEXT:    v_writelane_b32 v32, s36, 2
+; SI-NEXT:    v_writelane_b32 v32, s37, 3
+; SI-NEXT:    v_writelane_b32 v32, s38, 4
+; SI-NEXT:    v_writelane_b32 v32, s39, 5
+; SI-NEXT:    v_writelane_b32 v32, s48, 6
+; SI-NEXT:    v_writelane_b32 v32, s49, 7
+; SI-NEXT:    v_writelane_b32 v32, s50, 8
+; SI-NEXT:    v_writelane_b32 v32, s51, 9
+; SI-NEXT:    v_writelane_b32 v32, s52, 10
+; SI-NEXT:    v_writelane_b32 v32, s53, 11
+; SI-NEXT:    v_writelane_b32 v32, s54, 12
+; SI-NEXT:    v_writelane_b32 v32, s55, 13
+; SI-NEXT:    v_writelane_b32 v32, s64, 14
+; SI-NEXT:    v_writelane_b32 v32, s65, 15
+; SI-NEXT:    v_writelane_b32 v32, s66, 16
+; SI-NEXT:    v_writelane_b32 v32, s67, 17
+; SI-NEXT:    v_writelane_b32 v32, s68, 18
+; SI-NEXT:    v_writelane_b32 v32, s69, 19
+; SI-NEXT:    v_writelane_b32 v32, s70, 20
+; SI-NEXT:    v_writelane_b32 v32, s71, 21
+; SI-NEXT:    v_writelane_b32 v32, s80, 22
+; SI-NEXT:    v_writelane_b32 v32, s81, 23
+; SI-NEXT:    v_writelane_b32 v32, s82, 24
+; SI-NEXT:    v_writelane_b32 v32, s83, 25
+; SI-NEXT:    v_writelane_b32 v32, s84, 26
+; SI-NEXT:    v_writelane_b32 v32, s85, 27
+; SI-NEXT:    v_writelane_b32 v32, s86, 28
+; SI-NEXT:    v_writelane_b32 v32, s87, 29
+; SI-NEXT:    v_writelane_b32 v32, s30, 30
 ; SI-NEXT:    v_readfirstlane_b32 s6, v13
 ; SI-NEXT:    v_readfirstlane_b32 s8, v12
 ; SI-NEXT:    v_readfirstlane_b32 s10, v11
@@ -22674,7 +22674,7 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a,
 ; SI-NEXT:    v_readfirstlane_b32 s92, v2
 ; SI-NEXT:    v_readfirstlane_b32 s95, v1
 ; SI-NEXT:    v_readfirstlane_b32 s34, v0
-; SI-NEXT:    v_writelane_b32 v32, s87, 31
+; SI-NEXT:    v_writelane_b32 v32, s31, 31
 ; SI-NEXT:    s_lshr_b32 s94, s29, 16
 ; SI-NEXT:    s_lshr_b32 s30, s28, 16
 ; SI-NEXT:    s_lshr_b32 s35, s27, 16
@@ -23055,38 +23055,38 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a,
 ; SI-NEXT:    v_mov_b32_e32 v30, s66
 ; SI-NEXT:    v_mov_b32_e32 v31, s67
 ; SI-NEXT:  .LBB35_5: ; %end
-; SI-NEXT:    v_readlane_b32 s87, v32, 31
-; SI-NEXT:    v_readlane_b32 s86, v32, 30
-; SI-NEXT:    v_readlane_b32 s85, v32, 29
-; SI-NEXT:    v_readlane_b32 s84, v32, 28
-; SI-NEXT:    v_readlane_b32 s83, v32, 27
-; SI-NEXT:    v_readlane_b32 s82, v32, 26
-; SI-NEXT:    v_readlane_b32 s81, v32, 25
-; SI-NEXT:    v_readlane_b32 s80, v32, 24
-; SI-NEXT:    v_readlane_b32 s71, v32, 23
-; SI-NEXT:    v_readlane_b32 s70, v32, 22
-; SI-NEXT:    v_readlane_b32 s69, v32, 21
-; SI-NEXT:    v_readlane_b32 s68, v32, 20
-; SI-NEXT:    v_readlane_b32 s67, v32, 19
-; SI-NEXT:    v_readlane_b32 s66, v32, 18
-; SI-NEXT:    v_readlane_b32 s65, v32, 17
-; SI-NEXT:    v_readlane_b32 s64, v32, 16
-; SI-NEXT:    v_readlane_b32 s55, v32, 15
-; SI-NEXT:    v_readlane_b32 s54, v32, 14
-; SI-NEXT:    v_readlane_b32 s53, v32, 13
-; SI-NEXT:    v_readlane_b32 s52, v32, 12
-; SI-NEXT:    v_readlane_b32 s51, v32, 11
-; SI-NEXT:    v_readlane_b32 s50, v32, 10
-; SI-NEXT:    v_readlane_b32 s49, v32, 9
-; SI-NEXT:    v_readlane_b32 s48, v32, 8
-; SI-NEXT:    v_readlane_b32 s39, v32, 7
-; SI-NEXT:    v_readlane_b32 s38, v32, 6
-; SI-NEXT:    v_readlane_b32 s37, v32, 5
-; SI-NEXT:    v_readlane_b32 s36, v32, 4
-; SI-NEXT:    v_readlane_b32 s35, v32, 3
-; SI-NEXT:    v_readlane_b32 s34, v32, 2
-; SI-NEXT:    v_readlane_b32 s31, v32, 1
-; SI-NEXT:    v_readlane_b32 s30, v32, 0
+; SI-NEXT:    v_readlane_b32 s30, v32, 30
+; SI-NEXT:    v_readlane_b32 s31, v32, 31
+; SI-NEXT:    v_readlane_b32 s87, v32, 29
+; SI-NEXT:    v_readlane_b32 s86, v32, 28
+; SI-NEXT:    v_readlane_b32 s85, v32, 27
+; SI-NEXT:    v_readlane_b32 s84, v32, 26
+; SI-NEXT:    v_readlane_b32 s83, v32, 25
+; SI-NEXT:    v_readlane_b32 s82, v32, 24
+; SI-NEXT:    v_readlane_b32 s81, v32, 23
+; SI-NEXT:    v_readlane_b32 s80, v32, 22
+; SI-NEXT:    v_readlane_b32 s71, v32, 21
+; SI-NEXT:    v_readlane_b32 s70, v32, 20
+; SI-NEXT:    v_readlane_b32 s69, v32, 19
+; SI-NEXT:    v_readlane_b32 s68, v32, 18
+; SI-NEXT:    v_readlane_b32 s67, v32, 17
+; SI-NEXT:    v_readlane_b32 s66, v32, 16
+; SI-NEXT:    v_readlane_b32 s65, v32, 15
+; SI-NEXT:    v_readlane_b32 s64, v32, 14
+; SI-NEXT:    v_readlane_b32 s55, v32, 13
+; SI-NEXT:    v_readlane_b32 s54, v32, 12
+; SI-NEXT:    v_readlane_b32 s53, v32, 11
+; SI-NEXT:    v_readlane_b32 s52, v32, 10
+; SI-NEXT:    v_readlane_b32 s51, v32, 9
+; SI-NEXT:    v_readlane_b32 s50, v32, 8
+; SI-NEXT:    v_readlane_b32 s49, v32, 7
+; SI-NEXT:    v_readlane_b32 s48, v32, 6
+; SI-NEXT:    v_readlane_b32 s39, v32, 5
+; SI-NEXT:    v_readlane_b32 s38, v32, 4
+; SI-NEXT:    v_readlane_b32 s37, v32, 3
+; SI-NEXT:    v_readlane_b32 s36, v32, 2
+; SI-NEXT:    v_readlane_b32 s35, v32, 1
+; SI-NEXT:    v_readlane_b32 s34, v32, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -23100,41 +23100,41 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a,
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v32, s30, 0
-; VI-NEXT:    v_writelane_b32 v32, s31, 1
-; VI-NEXT:    v_writelane_b32 v32, s34, 2
-; VI-NEXT:    v_writelane_b32 v32, s35, 3
-; VI-NEXT:    v_writelane_b32 v32, s36, 4
-; VI-NEXT:    v_writelane_b32 v32, s37, 5
-; VI-NEXT:    v_writelane_b32 v32, s38, 6
-; VI-NEXT:    v_writelane_b32 v32, s39, 7
-; VI-NEXT:    v_writelane_b32 v32, s48, 8
-; VI-NEXT:    v_writelane_b32 v32, s49, 9
-; VI-NEXT:    v_writelane_b32 v32, s50, 10
-; VI-NEXT:    v_writelane_b32 v32, s51, 11
-; VI-NEXT:    v_writelane_b32 v32, s52, 12
-; VI-NEXT:    v_writelane_b32 v32, s53, 13
-; VI-NEXT:    v_writelane_b32 v32, s54, 14
-; VI-NEXT:    v_writelane_b32 v32, s55, 15
-; VI-NEXT:    v_writelane_b32 v32, s64, 16
-; VI-NEXT:    v_writelane_b32 v32, s65, 17
-; VI-NEXT:    v_writelane_b32 v32, s66, 18
-; VI-NEXT:    v_writelane_b32 v32, s67, 19
-; VI-NEXT:    v_writelane_b32 v32, s68, 20
-; VI-NEXT:    v_writelane_b32 v32, s69, 21
-; VI-NEXT:    v_writelane_b32 v32, s70, 22
-; VI-NEXT:    v_writelane_b32 v32, s71, 23
-; VI-NEXT:    v_writelane_b32 v32, s80, 24
-; VI-NEXT:    v_writelane_b32 v32, s81, 25
-; VI-NEXT:    v_writelane_b32 v32, s82, 26
-; VI-NEXT:    v_writelane_b32 v32, s83, 27
-; VI-NEXT:    v_writelane_b32 v32, s84, 28
+; VI-NEXT:    v_writelane_b32 v32, s34, 0
+; VI-NEXT:    v_writelane_b32 v32, s35, 1
+; VI-NEXT:    v_writelane_b32 v32, s36, 2
+; VI-NEXT:    v_writelane_b32 v32, s37, 3
+; VI-NEXT:    v_writelane_b32 v32, s38, 4
+; VI-NEXT:    v_writelane_b32 v32, s39, 5
+; VI-NEXT:    v_writelane_b32 v32, s48, 6
+; VI-NEXT:    v_writelane_b32 v32, s49, 7
+; VI-NEXT:    v_writelane_b32 v32, s50, 8
+; VI-NEXT:    v_writelane_b32 v32, s51, 9
+; VI-NEXT:    v_writelane_b32 v32, s52, 10
+; VI-NEXT:    v_writelane_b32 v32, s53, 11
+; VI-NEXT:    v_writelane_b32 v32, s54, 12
+; VI-NEXT:    v_writelane_b32 v32, s55, 13
+; VI-NEXT:    v_writelane_b32 v32, s64, 14
+; VI-NEXT:    v_writelane_b32 v32, s65, 15
+; VI-NEXT:    v_writelane_b32 v32, s66, 16
+; VI-NEXT:    v_writelane_b32 v32, s67, 17
+; VI-NEXT:    v_writelane_b32 v32, s68, 18
+; VI-NEXT:    v_writelane_b32 v32, s69, 19
+; VI-NEXT:    v_writelane_b32 v32, s70, 20
+; VI-NEXT:    v_writelane_b32 v32, s71, 21
+; VI-NEXT:    v_writelane_b32 v32, s80, 22
+; VI-NEXT:    v_writelane_b32 v32, s81, 23
+; VI-NEXT:    v_writelane_b32 v32, s82, 24
+; VI-NEXT:    v_writelane_b32 v32, s83, 25
+; VI-NEXT:    v_writelane_b32 v32, s84, 26
+; VI-NEXT:    v_writelane_b32 v32, s85, 27
+; VI-NEXT:    v_writelane_b32 v32, s86, 28
 ; VI-NEXT:    v_readfirstlane_b32 s8, v12
-; VI-NEXT:    v_writelane_b32 v32, s85, 29
+; VI-NEXT:    v_writelane_b32 v32, s87, 29
 ; VI-NEXT:    s_lshr_b32 s15, s8, 16
 ; VI-NEXT:    v_readfirstlane_b32 s10, v11
 ; VI-NEXT:    ; implicit-def: $vgpr33 : SGPR spill to VGPR lane
-; VI-NEXT:    v_writelane_b32 v32, s86, 30
+; VI-NEXT:    v_writelane_b32 v32, s30, 30
 ; VI-NEXT:    v_readfirstlane_b32 s6, v13
 ; VI-NEXT:    s_lshr_b32 s61, s10, 16
 ; VI-NEXT:    v_readfirstlane_b32 s12, v10
@@ -23149,7 +23149,7 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a,
 ; VI-NEXT:    v_readfirstlane_b32 s80, v1
 ; VI-NEXT:    v_readfirstlane_b32 s83, v0
 ; VI-NEXT:    v_writelane_b32 v33, s15, 0
-; VI-NEXT:    v_writelane_b32 v32, s87, 31
+; VI-NEXT:    v_writelane_b32 v32, s31, 31
 ; VI-NEXT:    s_lshr_b32 s56, s29, 16
 ; VI-NEXT:    s_lshr_b32 s75, s28, 16
 ; VI-NEXT:    s_lshr_b32 s90, s27, 16
@@ -23436,38 +23436,38 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a,
 ; VI-NEXT:    v_mov_b32_e32 v30, s66
 ; VI-NEXT:    v_mov_b32_e32 v31, s67
 ; VI-NEXT:  .LBB35_5: ; %end
-; VI-NEXT:    v_readlane_b32 s87, v32, 31
-; VI-NEXT:    v_readlane_b32 s86, v32, 30
-; VI-NEXT:    v_readlane_b32 s85, v32, 29
-; VI-NEXT:    v_readlane_b32 s84, v32, 28
-; VI-NEXT:    v_readlane_b32 s83, v32, 27
-; VI-NEXT:    v_readlane_b32 s82, v32, 26
-; VI-NEXT:    v_readlane_b32 s81, v32, 25
-; VI-NEXT:    v_readlane_b32 s80, v32, 24
-; VI-NEXT:    v_readlane_b32 s71, v32, 23
-; VI-NEXT:    v_readlane_b32 s70, v32, 22
-; VI-NEXT:    v_readlane_b32 s69, v32, 21
-; VI-NEXT:    v_readlane_b32 s68, v32, 20
-; VI-NEXT:    v_readlane_b32 s67, v32, 19
-; VI-NEXT:    v_readlane_b32 s66, v32, 18
-; VI-NEXT:    v_readlane_b32 s65, v32, 17
-; VI-NEXT:    v_readlane_b32 s64, v32, 16
-; VI-NEXT:    v_readlane_b32 s55, v32, 15
-; VI-NEXT:    v_readlane_b32 s54, v32, 14
-; VI-NEXT:    v_readlane_b32 s53, v32, 13
-; VI-NEXT:    v_readlane_b32 s52, v32, 12
-; VI-NEXT:    v_readlane_b32 s51, v32, 11
-; VI-NEXT:    v_readlane_b32 s50, v32, 10
-; VI-NEXT:    v_readlane_b32 s49, v32, 9
-; VI-NEXT:    v_readlane_b32 s48, v32, 8
-; VI-NEXT:    v_readlane_b32 s39, v32, 7
-; VI-NEXT:    v_readlane_b32 s38, v32, 6
-; VI-NEXT:    v_readlane_b32 s37, v32, 5
-; VI-NEXT:    v_readlane_b32 s36, v32, 4
-; VI-NEXT:    v_readlane_b32 s35, v32, 3
-; VI-NEXT:    v_readlane_b32 s34, v32, 2
-; VI-NEXT:    v_readlane_b32 s31, v32, 1
-; VI-NEXT:    v_readlane_b32 s30, v32, 0
+; VI-NEXT:    v_readlane_b32 s30, v32, 30
+; VI-NEXT:    v_readlane_b32 s31, v32, 31
+; VI-NEXT:    v_readlane_b32 s87, v32, 29
+; VI-NEXT:    v_readlane_b32 s86, v32, 28
+; VI-NEXT:    v_readlane_b32 s85, v32, 27
+; VI-NEXT:    v_readlane_b32 s84, v32, 26
+; VI-NEXT:    v_readlane_b32 s83, v32, 25
+; VI-NEXT:    v_readlane_b32 s82, v32, 24
+; VI-NEXT:    v_readlane_b32 s81, v32, 23
+; VI-NEXT:    v_readlane_b32 s80, v32, 22
+; VI-NEXT:    v_readlane_b32 s71, v32, 21
+; VI-NEXT:    v_readlane_b32 s70, v32, 20
+; VI-NEXT:    v_readlane_b32 s69, v32, 19
+; VI-NEXT:    v_readlane_b32 s68, v32, 18
+; VI-NEXT:    v_readlane_b32 s67, v32, 17
+; VI-NEXT:    v_readlane_b32 s66, v32, 16
+; VI-NEXT:    v_readlane_b32 s65, v32, 15
+; VI-NEXT:    v_readlane_b32 s64, v32, 14
+; VI-NEXT:    v_readlane_b32 s55, v32, 13
+; VI-NEXT:    v_readlane_b32 s54, v32, 12
+; VI-NEXT:    v_readlane_b32 s53, v32, 11
+; VI-NEXT:    v_readlane_b32 s52, v32, 10
+; VI-NEXT:    v_readlane_b32 s51, v32, 9
+; VI-NEXT:    v_readlane_b32 s50, v32, 8
+; VI-NEXT:    v_readlane_b32 s49, v32, 7
+; VI-NEXT:    v_readlane_b32 s48, v32, 6
+; VI-NEXT:    v_readlane_b32 s39, v32, 5
+; VI-NEXT:    v_readlane_b32 s38, v32, 4
+; VI-NEXT:    v_readlane_b32 s37, v32, 3
+; VI-NEXT:    v_readlane_b32 s36, v32, 2
+; VI-NEXT:    v_readlane_b32 s35, v32, 1
+; VI-NEXT:    v_readlane_b32 s34, v32, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -23796,7 +23796,7 @@ end:
   ret <28 x float> %phi
 }
 
-define <14 x double> @bitcast_v14i64_to_v14f64(<14 x i64> %a, i32 %b) {
+define <14 x double> @bitcast_v14i64_to_v14f64(<14 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v14i64_to_v14f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -23985,7 +23985,7 @@ end:
   ret <14 x double> %phi
 }
 
-define inreg <14 x double> @bitcast_v14i64_to_v14f64_scalar(<14 x i64> inreg %a, i32 inreg %b) {
+define inreg <14 x double> @bitcast_v14i64_to_v14f64_scalar(<14 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v14i64_to_v14f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -24322,7 +24322,7 @@ end:
   ret <14 x double> %phi
 }
 
-define <14 x i64> @bitcast_v14f64_to_v14i64(<14 x double> %a, i32 %b) {
+define <14 x i64> @bitcast_v14f64_to_v14i64(<14 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v14f64_to_v14i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -24448,7 +24448,7 @@ end:
   ret <14 x i64> %phi
 }
 
-define inreg <14 x i64> @bitcast_v14f64_to_v14i64_scalar(<14 x double> inreg %a, i32 inreg %b) {
+define inreg <14 x i64> @bitcast_v14f64_to_v14i64_scalar(<14 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v14f64_to_v14i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -24936,7 +24936,7 @@ end:
   ret <14 x i64> %phi
 }
 
-define <56 x i16> @bitcast_v14i64_to_v56i16(<14 x i64> %a, i32 %b) {
+define <56 x i16> @bitcast_v14i64_to_v56i16(<14 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v14i64_to_v56i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -25786,7 +25786,7 @@ end:
   ret <56 x i16> %phi
 }
 
-define inreg <56 x i16> @bitcast_v14i64_to_v56i16_scalar(<14 x i64> inreg %a, i32 inreg %b) {
+define inreg <56 x i16> @bitcast_v14i64_to_v56i16_scalar(<14 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v14i64_to_v56i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -25794,20 +25794,20 @@ define inreg <56 x i16> @bitcast_v14i64_to_v56i16_scalar(<14 x i64> inreg %a, i3
 ; SI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v28, s30, 0
-; SI-NEXT:    v_writelane_b32 v28, s31, 1
-; SI-NEXT:    v_writelane_b32 v28, s34, 2
-; SI-NEXT:    v_writelane_b32 v28, s35, 3
-; SI-NEXT:    v_writelane_b32 v28, s36, 4
-; SI-NEXT:    v_writelane_b32 v28, s37, 5
-; SI-NEXT:    v_writelane_b32 v28, s38, 6
-; SI-NEXT:    v_writelane_b32 v28, s39, 7
-; SI-NEXT:    v_writelane_b32 v28, s48, 8
-; SI-NEXT:    v_writelane_b32 v28, s49, 9
-; SI-NEXT:    v_writelane_b32 v28, s50, 10
-; SI-NEXT:    v_writelane_b32 v28, s51, 11
+; SI-NEXT:    v_writelane_b32 v28, s34, 0
+; SI-NEXT:    v_writelane_b32 v28, s35, 1
+; SI-NEXT:    v_writelane_b32 v28, s36, 2
+; SI-NEXT:    v_writelane_b32 v28, s37, 3
+; SI-NEXT:    v_writelane_b32 v28, s38, 4
+; SI-NEXT:    v_writelane_b32 v28, s39, 5
+; SI-NEXT:    v_writelane_b32 v28, s48, 6
+; SI-NEXT:    v_writelane_b32 v28, s49, 7
+; SI-NEXT:    v_writelane_b32 v28, s50, 8
+; SI-NEXT:    v_writelane_b32 v28, s51, 9
+; SI-NEXT:    v_writelane_b32 v28, s52, 10
+; SI-NEXT:    v_writelane_b32 v28, s53, 11
 ; SI-NEXT:    v_readfirstlane_b32 s40, v14
-; SI-NEXT:    v_writelane_b32 v28, s52, 12
+; SI-NEXT:    v_writelane_b32 v28, s30, 12
 ; SI-NEXT:    v_readfirstlane_b32 s5, v13
 ; SI-NEXT:    v_readfirstlane_b32 s4, v12
 ; SI-NEXT:    v_readfirstlane_b32 s7, v11
@@ -25823,7 +25823,7 @@ define inreg <56 x i16> @bitcast_v14i64_to_v56i16_scalar(<14 x i64> inreg %a, i3
 ; SI-NEXT:    v_readfirstlane_b32 s41, v1
 ; SI-NEXT:    s_cmp_lg_u32 s40, 0
 ; SI-NEXT:    v_readfirstlane_b32 s40, v0
-; SI-NEXT:    v_writelane_b32 v28, s53, 13
+; SI-NEXT:    v_writelane_b32 v28, s31, 13
 ; SI-NEXT:    s_cbranch_scc0 .LBB41_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s30, s5, 16
@@ -25997,6 +25997,7 @@ define inreg <56 x i16> @bitcast_v14i64_to_v56i16_scalar(<14 x i64> inreg %a, i3
 ; SI-NEXT:    s_lshl_b32 s42, s30, 16
 ; SI-NEXT:    s_or_b32 s7, s7, s43
 ; SI-NEXT:    s_or_b32 s5, s5, s42
+; SI-NEXT:    v_readlane_b32 s30, v28, 12
 ; SI-NEXT:    v_mov_b32_e32 v0, s16
 ; SI-NEXT:    v_mov_b32_e32 v1, s17
 ; SI-NEXT:    v_mov_b32_e32 v2, s18
@@ -26025,20 +26026,19 @@ define inreg <56 x i16> @bitcast_v14i64_to_v56i16_scalar(<14 x i64> inreg %a, i3
 ; SI-NEXT:    v_mov_b32_e32 v25, s7
 ; SI-NEXT:    v_mov_b32_e32 v26, s4
 ; SI-NEXT:    v_mov_b32_e32 v27, s5
-; SI-NEXT:    v_readlane_b32 s53, v28, 13
-; SI-NEXT:    v_readlane_b32 s52, v28, 12
-; SI-NEXT:    v_readlane_b32 s51, v28, 11
-; SI-NEXT:    v_readlane_b32 s50, v28, 10
-; SI-NEXT:    v_readlane_b32 s49, v28, 9
-; SI-NEXT:    v_readlane_b32 s48, v28, 8
-; SI-NEXT:    v_readlane_b32 s39, v28, 7
-; SI-NEXT:    v_readlane_b32 s38, v28, 6
-; SI-NEXT:    v_readlane_b32 s37, v28, 5
-; SI-NEXT:    v_readlane_b32 s36, v28, 4
-; SI-NEXT:    v_readlane_b32 s35, v28, 3
-; SI-NEXT:    v_readlane_b32 s34, v28, 2
-; SI-NEXT:    v_readlane_b32 s31, v28, 1
-; SI-NEXT:    v_readlane_b32 s30, v28, 0
+; SI-NEXT:    v_readlane_b32 s31, v28, 13
+; SI-NEXT:    v_readlane_b32 s53, v28, 11
+; SI-NEXT:    v_readlane_b32 s52, v28, 10
+; SI-NEXT:    v_readlane_b32 s51, v28, 9
+; SI-NEXT:    v_readlane_b32 s50, v28, 8
+; SI-NEXT:    v_readlane_b32 s49, v28, 7
+; SI-NEXT:    v_readlane_b32 s48, v28, 6
+; SI-NEXT:    v_readlane_b32 s39, v28, 5
+; SI-NEXT:    v_readlane_b32 s38, v28, 4
+; SI-NEXT:    v_readlane_b32 s37, v28, 3
+; SI-NEXT:    v_readlane_b32 s36, v28, 2
+; SI-NEXT:    v_readlane_b32 s35, v28, 1
+; SI-NEXT:    v_readlane_b32 s34, v28, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -26081,10 +26081,10 @@ define inreg <56 x i16> @bitcast_v14i64_to_v56i16_scalar(<14 x i64> inreg %a, i3
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v28, s30, 0
-; VI-NEXT:    v_writelane_b32 v28, s31, 1
+; VI-NEXT:    v_writelane_b32 v28, s34, 0
+; VI-NEXT:    v_writelane_b32 v28, s35, 1
 ; VI-NEXT:    v_readfirstlane_b32 s4, v14
-; VI-NEXT:    v_writelane_b32 v28, s34, 2
+; VI-NEXT:    v_writelane_b32 v28, s30, 2
 ; VI-NEXT:    v_readfirstlane_b32 s6, v13
 ; VI-NEXT:    v_readfirstlane_b32 s7, v12
 ; VI-NEXT:    v_readfirstlane_b32 s8, v11
@@ -26100,7 +26100,7 @@ define inreg <56 x i16> @bitcast_v14i64_to_v56i16_scalar(<14 x i64> inreg %a, i3
 ; VI-NEXT:    v_readfirstlane_b32 s42, v1
 ; VI-NEXT:    s_cmp_lg_u32 s4, 0
 ; VI-NEXT:    v_readfirstlane_b32 s43, v0
-; VI-NEXT:    v_writelane_b32 v28, s35, 3
+; VI-NEXT:    v_writelane_b32 v28, s31, 3
 ; VI-NEXT:    s_cbranch_scc0 .LBB41_4
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    s_lshr_b32 s44, s6, 16
@@ -26274,6 +26274,7 @@ define inreg <56 x i16> @bitcast_v14i64_to_v56i16_scalar(<14 x i64> inreg %a, i3
 ; VI-NEXT:    s_and_b32 s6, 0xffff, s6
 ; VI-NEXT:    s_lshl_b32 s42, s44, 16
 ; VI-NEXT:    s_or_b32 s6, s6, s42
+; VI-NEXT:    v_readlane_b32 s30, v28, 2
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_mov_b32_e32 v2, s16
@@ -26302,10 +26303,9 @@ define inreg <56 x i16> @bitcast_v14i64_to_v56i16_scalar(<14 x i64> inreg %a, i3
 ; VI-NEXT:    v_mov_b32_e32 v25, s8
 ; VI-NEXT:    v_mov_b32_e32 v26, s7
 ; VI-NEXT:    v_mov_b32_e32 v27, s6
-; VI-NEXT:    v_readlane_b32 s35, v28, 3
-; VI-NEXT:    v_readlane_b32 s34, v28, 2
-; VI-NEXT:    v_readlane_b32 s31, v28, 1
-; VI-NEXT:    v_readlane_b32 s30, v28, 0
+; VI-NEXT:    v_readlane_b32 s31, v28, 3
+; VI-NEXT:    v_readlane_b32 s35, v28, 1
+; VI-NEXT:    v_readlane_b32 s34, v28, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -26735,7 +26735,7 @@ end:
   ret <56 x i16> %phi
 }
 
-define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) {
+define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v56i16_to_v14i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -27980,7 +27980,7 @@ end:
   ret <14 x i64> %phi
 }
 
-define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i32 inreg %b) {
+define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v56i16_to_v14i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -27988,37 +27988,38 @@ define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i3
 ; SI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v28, s30, 0
-; SI-NEXT:    v_writelane_b32 v28, s31, 1
-; SI-NEXT:    v_writelane_b32 v28, s34, 2
-; SI-NEXT:    v_writelane_b32 v28, s35, 3
-; SI-NEXT:    v_writelane_b32 v28, s36, 4
-; SI-NEXT:    v_writelane_b32 v28, s37, 5
-; SI-NEXT:    v_writelane_b32 v28, s38, 6
-; SI-NEXT:    v_writelane_b32 v28, s39, 7
-; SI-NEXT:    v_writelane_b32 v28, s48, 8
-; SI-NEXT:    v_writelane_b32 v28, s49, 9
-; SI-NEXT:    v_writelane_b32 v28, s50, 10
-; SI-NEXT:    v_writelane_b32 v28, s51, 11
-; SI-NEXT:    v_writelane_b32 v28, s52, 12
-; SI-NEXT:    v_writelane_b32 v28, s53, 13
-; SI-NEXT:    v_writelane_b32 v28, s54, 14
-; SI-NEXT:    v_writelane_b32 v28, s55, 15
-; SI-NEXT:    v_writelane_b32 v28, s64, 16
-; SI-NEXT:    v_writelane_b32 v28, s65, 17
-; SI-NEXT:    v_writelane_b32 v28, s66, 18
-; SI-NEXT:    v_writelane_b32 v28, s67, 19
-; SI-NEXT:    v_writelane_b32 v28, s68, 20
-; SI-NEXT:    v_writelane_b32 v28, s69, 21
-; SI-NEXT:    v_writelane_b32 v28, s70, 22
-; SI-NEXT:    v_writelane_b32 v28, s71, 23
-; SI-NEXT:    v_writelane_b32 v28, s80, 24
-; SI-NEXT:    v_writelane_b32 v28, s81, 25
-; SI-NEXT:    v_writelane_b32 v28, s82, 26
-; SI-NEXT:    v_writelane_b32 v28, s83, 27
-; SI-NEXT:    v_writelane_b32 v28, s84, 28
-; SI-NEXT:    v_writelane_b32 v28, s85, 29
-; SI-NEXT:    v_writelane_b32 v28, s86, 30
+; SI-NEXT:    v_writelane_b32 v28, s34, 0
+; SI-NEXT:    v_writelane_b32 v28, s35, 1
+; SI-NEXT:    v_writelane_b32 v28, s36, 2
+; SI-NEXT:    v_writelane_b32 v28, s37, 3
+; SI-NEXT:    v_writelane_b32 v28, s38, 4
+; SI-NEXT:    v_writelane_b32 v28, s39, 5
+; SI-NEXT:    v_writelane_b32 v28, s48, 6
+; SI-NEXT:    v_writelane_b32 v28, s49, 7
+; SI-NEXT:    v_writelane_b32 v28, s50, 8
+; SI-NEXT:    v_writelane_b32 v28, s51, 9
+; SI-NEXT:    v_writelane_b32 v28, s52, 10
+; SI-NEXT:    v_writelane_b32 v28, s53, 11
+; SI-NEXT:    v_writelane_b32 v28, s54, 12
+; SI-NEXT:    v_writelane_b32 v28, s55, 13
+; SI-NEXT:    v_writelane_b32 v28, s64, 14
+; SI-NEXT:    v_writelane_b32 v28, s65, 15
+; SI-NEXT:    v_writelane_b32 v28, s66, 16
+; SI-NEXT:    v_writelane_b32 v28, s67, 17
+; SI-NEXT:    v_writelane_b32 v28, s68, 18
+; SI-NEXT:    v_writelane_b32 v28, s69, 19
+; SI-NEXT:    v_writelane_b32 v28, s70, 20
+; SI-NEXT:    v_writelane_b32 v28, s71, 21
+; SI-NEXT:    v_writelane_b32 v28, s80, 22
+; SI-NEXT:    v_writelane_b32 v28, s81, 23
+; SI-NEXT:    v_writelane_b32 v28, s82, 24
+; SI-NEXT:    v_writelane_b32 v28, s83, 25
+; SI-NEXT:    v_writelane_b32 v28, s84, 26
+; SI-NEXT:    v_writelane_b32 v28, s85, 27
+; SI-NEXT:    v_writelane_b32 v28, s86, 28
+; SI-NEXT:    v_writelane_b32 v28, s87, 29
+; SI-NEXT:    v_writelane_b32 v28, s30, 30
+; SI-NEXT:    v_writelane_b32 v28, s31, 31
 ; SI-NEXT:    v_readfirstlane_b32 s7, v13
 ; SI-NEXT:    v_readfirstlane_b32 s9, v12
 ; SI-NEXT:    v_readfirstlane_b32 s11, v11
@@ -28033,7 +28034,6 @@ define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i3
 ; SI-NEXT:    v_readfirstlane_b32 s31, v2
 ; SI-NEXT:    v_readfirstlane_b32 s69, v1
 ; SI-NEXT:    v_readfirstlane_b32 s80, v0
-; SI-NEXT:    v_writelane_b32 v28, s87, 31
 ; SI-NEXT:    s_lshr_b32 s78, s29, 16
 ; SI-NEXT:    s_lshr_b32 s89, s28, 16
 ; SI-NEXT:    s_lshr_b32 s92, s27, 16
@@ -28293,6 +28293,7 @@ define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i3
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_add_i32 s63, s4, 0x30000
 ; SI-NEXT:  .LBB43_3: ; %end
+; SI-NEXT:    v_readlane_b32 s30, v28, 30
 ; SI-NEXT:    v_mov_b32_e32 v0, s36
 ; SI-NEXT:    v_mov_b32_e32 v1, s37
 ; SI-NEXT:    v_mov_b32_e32 v2, s38
@@ -28321,38 +28322,37 @@ define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i3
 ; SI-NEXT:    v_mov_b32_e32 v25, s61
 ; SI-NEXT:    v_mov_b32_e32 v26, s62
 ; SI-NEXT:    v_mov_b32_e32 v27, s63
-; SI-NEXT:    v_readlane_b32 s87, v28, 31
-; SI-NEXT:    v_readlane_b32 s86, v28, 30
-; SI-NEXT:    v_readlane_b32 s85, v28, 29
-; SI-NEXT:    v_readlane_b32 s84, v28, 28
-; SI-NEXT:    v_readlane_b32 s83, v28, 27
-; SI-NEXT:    v_readlane_b32 s82, v28, 26
-; SI-NEXT:    v_readlane_b32 s81, v28, 25
-; SI-NEXT:    v_readlane_b32 s80, v28, 24
-; SI-NEXT:    v_readlane_b32 s71, v28, 23
-; SI-NEXT:    v_readlane_b32 s70, v28, 22
-; SI-NEXT:    v_readlane_b32 s69, v28, 21
-; SI-NEXT:    v_readlane_b32 s68, v28, 20
-; SI-NEXT:    v_readlane_b32 s67, v28, 19
-; SI-NEXT:    v_readlane_b32 s66, v28, 18
-; SI-NEXT:    v_readlane_b32 s65, v28, 17
-; SI-NEXT:    v_readlane_b32 s64, v28, 16
-; SI-NEXT:    v_readlane_b32 s55, v28, 15
-; SI-NEXT:    v_readlane_b32 s54, v28, 14
-; SI-NEXT:    v_readlane_b32 s53, v28, 13
-; SI-NEXT:    v_readlane_b32 s52, v28, 12
-; SI-NEXT:    v_readlane_b32 s51, v28, 11
-; SI-NEXT:    v_readlane_b32 s50, v28, 10
-; SI-NEXT:    v_readlane_b32 s49, v28, 9
-; SI-NEXT:    v_readlane_b32 s48, v28, 8
-; SI-NEXT:    v_readlane_b32 s39, v28, 7
-; SI-NEXT:    v_readlane_b32 s38, v28, 6
-; SI-NEXT:    v_readlane_b32 s37, v28, 5
-; SI-NEXT:    v_readlane_b32 s36, v28, 4
-; SI-NEXT:    v_readlane_b32 s35, v28, 3
-; SI-NEXT:    v_readlane_b32 s34, v28, 2
-; SI-NEXT:    v_readlane_b32 s31, v28, 1
-; SI-NEXT:    v_readlane_b32 s30, v28, 0
+; SI-NEXT:    v_readlane_b32 s31, v28, 31
+; SI-NEXT:    v_readlane_b32 s87, v28, 29
+; SI-NEXT:    v_readlane_b32 s86, v28, 28
+; SI-NEXT:    v_readlane_b32 s85, v28, 27
+; SI-NEXT:    v_readlane_b32 s84, v28, 26
+; SI-NEXT:    v_readlane_b32 s83, v28, 25
+; SI-NEXT:    v_readlane_b32 s82, v28, 24
+; SI-NEXT:    v_readlane_b32 s81, v28, 23
+; SI-NEXT:    v_readlane_b32 s80, v28, 22
+; SI-NEXT:    v_readlane_b32 s71, v28, 21
+; SI-NEXT:    v_readlane_b32 s70, v28, 20
+; SI-NEXT:    v_readlane_b32 s69, v28, 19
+; SI-NEXT:    v_readlane_b32 s68, v28, 18
+; SI-NEXT:    v_readlane_b32 s67, v28, 17
+; SI-NEXT:    v_readlane_b32 s66, v28, 16
+; SI-NEXT:    v_readlane_b32 s65, v28, 15
+; SI-NEXT:    v_readlane_b32 s64, v28, 14
+; SI-NEXT:    v_readlane_b32 s55, v28, 13
+; SI-NEXT:    v_readlane_b32 s54, v28, 12
+; SI-NEXT:    v_readlane_b32 s53, v28, 11
+; SI-NEXT:    v_readlane_b32 s52, v28, 10
+; SI-NEXT:    v_readlane_b32 s51, v28, 9
+; SI-NEXT:    v_readlane_b32 s50, v28, 8
+; SI-NEXT:    v_readlane_b32 s49, v28, 7
+; SI-NEXT:    v_readlane_b32 s48, v28, 6
+; SI-NEXT:    v_readlane_b32 s39, v28, 5
+; SI-NEXT:    v_readlane_b32 s38, v28, 4
+; SI-NEXT:    v_readlane_b32 s37, v28, 3
+; SI-NEXT:    v_readlane_b32 s36, v28, 2
+; SI-NEXT:    v_readlane_b32 s35, v28, 1
+; SI-NEXT:    v_readlane_b32 s34, v28, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -28369,38 +28369,37 @@ define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i3
 ; VI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v28, s30, 0
-; VI-NEXT:    v_writelane_b32 v28, s31, 1
-; VI-NEXT:    v_writelane_b32 v28, s34, 2
-; VI-NEXT:    v_writelane_b32 v28, s35, 3
-; VI-NEXT:    v_writelane_b32 v28, s36, 4
-; VI-NEXT:    v_writelane_b32 v28, s37, 5
-; VI-NEXT:    v_writelane_b32 v28, s38, 6
-; VI-NEXT:    v_writelane_b32 v28, s39, 7
-; VI-NEXT:    v_writelane_b32 v28, s48, 8
-; VI-NEXT:    v_writelane_b32 v28, s49, 9
-; VI-NEXT:    v_writelane_b32 v28, s50, 10
-; VI-NEXT:    v_writelane_b32 v28, s51, 11
-; VI-NEXT:    v_writelane_b32 v28, s52, 12
-; VI-NEXT:    v_writelane_b32 v28, s53, 13
-; VI-NEXT:    v_writelane_b32 v28, s54, 14
-; VI-NEXT:    v_writelane_b32 v28, s55, 15
-; VI-NEXT:    v_writelane_b32 v28, s64, 16
-; VI-NEXT:    v_writelane_b32 v28, s65, 17
-; VI-NEXT:    v_writelane_b32 v28, s66, 18
-; VI-NEXT:    v_writelane_b32 v28, s67, 19
-; VI-NEXT:    v_writelane_b32 v28, s68, 20
-; VI-NEXT:    v_writelane_b32 v28, s69, 21
-; VI-NEXT:    v_writelane_b32 v28, s70, 22
-; VI-NEXT:    v_writelane_b32 v28, s71, 23
-; VI-NEXT:    v_writelane_b32 v28, s80, 24
-; VI-NEXT:    v_writelane_b32 v28, s81, 25
-; VI-NEXT:    v_writelane_b32 v28, s82, 26
-; VI-NEXT:    v_writelane_b32 v28, s83, 27
-; VI-NEXT:    v_writelane_b32 v28, s84, 28
-; VI-NEXT:    v_writelane_b32 v28, s85, 29
-; VI-NEXT:    v_writelane_b32 v28, s86, 30
-; VI-NEXT:    v_writelane_b32 v28, s87, 31
+; VI-NEXT:    v_writelane_b32 v28, s34, 0
+; VI-NEXT:    v_writelane_b32 v28, s35, 1
+; VI-NEXT:    v_writelane_b32 v28, s36, 2
+; VI-NEXT:    v_writelane_b32 v28, s37, 3
+; VI-NEXT:    v_writelane_b32 v28, s38, 4
+; VI-NEXT:    v_writelane_b32 v28, s39, 5
+; VI-NEXT:    v_writelane_b32 v28, s48, 6
+; VI-NEXT:    v_writelane_b32 v28, s49, 7
+; VI-NEXT:    v_writelane_b32 v28, s50, 8
+; VI-NEXT:    v_writelane_b32 v28, s51, 9
+; VI-NEXT:    v_writelane_b32 v28, s52, 10
+; VI-NEXT:    v_writelane_b32 v28, s53, 11
+; VI-NEXT:    v_writelane_b32 v28, s54, 12
+; VI-NEXT:    v_writelane_b32 v28, s55, 13
+; VI-NEXT:    v_writelane_b32 v28, s64, 14
+; VI-NEXT:    v_writelane_b32 v28, s65, 15
+; VI-NEXT:    v_writelane_b32 v28, s66, 16
+; VI-NEXT:    v_writelane_b32 v28, s67, 17
+; VI-NEXT:    v_writelane_b32 v28, s68, 18
+; VI-NEXT:    v_writelane_b32 v28, s69, 19
+; VI-NEXT:    v_writelane_b32 v28, s70, 20
+; VI-NEXT:    v_writelane_b32 v28, s71, 21
+; VI-NEXT:    v_writelane_b32 v28, s80, 22
+; VI-NEXT:    v_writelane_b32 v28, s81, 23
+; VI-NEXT:    v_writelane_b32 v28, s82, 24
+; VI-NEXT:    v_writelane_b32 v28, s83, 25
+; VI-NEXT:    v_writelane_b32 v28, s84, 26
+; VI-NEXT:    v_writelane_b32 v28, s85, 27
+; VI-NEXT:    v_writelane_b32 v28, s86, 28
+; VI-NEXT:    v_writelane_b32 v28, s87, 29
+; VI-NEXT:    v_writelane_b32 v28, s30, 30
 ; VI-NEXT:    v_readfirstlane_b32 s86, v13
 ; VI-NEXT:    v_readfirstlane_b32 s6, v12
 ; VI-NEXT:    v_readfirstlane_b32 s9, v11
@@ -28415,6 +28414,7 @@ define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i3
 ; VI-NEXT:    v_readfirstlane_b32 s69, v2
 ; VI-NEXT:    v_readfirstlane_b32 s81, v1
 ; VI-NEXT:    v_readfirstlane_b32 s84, v0
+; VI-NEXT:    v_writelane_b32 v28, s31, 31
 ; VI-NEXT:    s_lshr_b32 s79, s29, 16
 ; VI-NEXT:    s_lshr_b32 s90, s28, 16
 ; VI-NEXT:    s_lshr_b32 s31, s27, 16
@@ -28674,6 +28674,7 @@ define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i3
 ; VI-NEXT:    s_or_b32 s4, s5, s4
 ; VI-NEXT:    s_add_i32 s63, s4, 0x30000
 ; VI-NEXT:  .LBB43_3: ; %end
+; VI-NEXT:    v_readlane_b32 s30, v28, 30
 ; VI-NEXT:    v_mov_b32_e32 v0, s36
 ; VI-NEXT:    v_mov_b32_e32 v1, s37
 ; VI-NEXT:    v_mov_b32_e32 v2, s38
@@ -28702,38 +28703,37 @@ define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i3
 ; VI-NEXT:    v_mov_b32_e32 v25, s61
 ; VI-NEXT:    v_mov_b32_e32 v26, s62
 ; VI-NEXT:    v_mov_b32_e32 v27, s63
-; VI-NEXT:    v_readlane_b32 s87, v28, 31
-; VI-NEXT:    v_readlane_b32 s86, v28, 30
-; VI-NEXT:    v_readlane_b32 s85, v28, 29
-; VI-NEXT:    v_readlane_b32 s84, v28, 28
-; VI-NEXT:    v_readlane_b32 s83, v28, 27
-; VI-NEXT:    v_readlane_b32 s82, v28, 26
-; VI-NEXT:    v_readlane_b32 s81, v28, 25
-; VI-NEXT:    v_readlane_b32 s80, v28, 24
-; VI-NEXT:    v_readlane_b32 s71, v28, 23
-; VI-NEXT:    v_readlane_b32 s70, v28, 22
-; VI-NEXT:    v_readlane_b32 s69, v28, 21
-; VI-NEXT:    v_readlane_b32 s68, v28, 20
-; VI-NEXT:    v_readlane_b32 s67, v28, 19
-; VI-NEXT:    v_readlane_b32 s66, v28, 18
-; VI-NEXT:    v_readlane_b32 s65, v28, 17
-; VI-NEXT:    v_readlane_b32 s64, v28, 16
-; VI-NEXT:    v_readlane_b32 s55, v28, 15
-; VI-NEXT:    v_readlane_b32 s54, v28, 14
-; VI-NEXT:    v_readlane_b32 s53, v28, 13
-; VI-NEXT:    v_readlane_b32 s52, v28, 12
-; VI-NEXT:    v_readlane_b32 s51, v28, 11
-; VI-NEXT:    v_readlane_b32 s50, v28, 10
-; VI-NEXT:    v_readlane_b32 s49, v28, 9
-; VI-NEXT:    v_readlane_b32 s48, v28, 8
-; VI-NEXT:    v_readlane_b32 s39, v28, 7
-; VI-NEXT:    v_readlane_b32 s38, v28, 6
-; VI-NEXT:    v_readlane_b32 s37, v28, 5
-; VI-NEXT:    v_readlane_b32 s36, v28, 4
-; VI-NEXT:    v_readlane_b32 s35, v28, 3
-; VI-NEXT:    v_readlane_b32 s34, v28, 2
-; VI-NEXT:    v_readlane_b32 s31, v28, 1
-; VI-NEXT:    v_readlane_b32 s30, v28, 0
+; VI-NEXT:    v_readlane_b32 s31, v28, 31
+; VI-NEXT:    v_readlane_b32 s87, v28, 29
+; VI-NEXT:    v_readlane_b32 s86, v28, 28
+; VI-NEXT:    v_readlane_b32 s85, v28, 27
+; VI-NEXT:    v_readlane_b32 s84, v28, 26
+; VI-NEXT:    v_readlane_b32 s83, v28, 25
+; VI-NEXT:    v_readlane_b32 s82, v28, 24
+; VI-NEXT:    v_readlane_b32 s81, v28, 23
+; VI-NEXT:    v_readlane_b32 s80, v28, 22
+; VI-NEXT:    v_readlane_b32 s71, v28, 21
+; VI-NEXT:    v_readlane_b32 s70, v28, 20
+; VI-NEXT:    v_readlane_b32 s69, v28, 19
+; VI-NEXT:    v_readlane_b32 s68, v28, 18
+; VI-NEXT:    v_readlane_b32 s67, v28, 17
+; VI-NEXT:    v_readlane_b32 s66, v28, 16
+; VI-NEXT:    v_readlane_b32 s65, v28, 15
+; VI-NEXT:    v_readlane_b32 s64, v28, 14
+; VI-NEXT:    v_readlane_b32 s55, v28, 13
+; VI-NEXT:    v_readlane_b32 s54, v28, 12
+; VI-NEXT:    v_readlane_b32 s53, v28, 11
+; VI-NEXT:    v_readlane_b32 s52, v28, 10
+; VI-NEXT:    v_readlane_b32 s51, v28, 9
+; VI-NEXT:    v_readlane_b32 s50, v28, 8
+; VI-NEXT:    v_readlane_b32 s49, v28, 7
+; VI-NEXT:    v_readlane_b32 s48, v28, 6
+; VI-NEXT:    v_readlane_b32 s39, v28, 5
+; VI-NEXT:    v_readlane_b32 s38, v28, 4
+; VI-NEXT:    v_readlane_b32 s37, v28, 3
+; VI-NEXT:    v_readlane_b32 s36, v28, 2
+; VI-NEXT:    v_readlane_b32 s35, v28, 1
+; VI-NEXT:    v_readlane_b32 s34, v28, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -29151,7 +29151,7 @@ end:
   ret <14 x i64> %phi
 }
 
-define <56 x half> @bitcast_v14i64_to_v56f16(<14 x i64> %a, i32 %b) {
+define <56 x half> @bitcast_v14i64_to_v56f16(<14 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v14i64_to_v56f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -30001,7 +30001,7 @@ end:
   ret <56 x half> %phi
 }
 
-define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i32 inreg %b) {
+define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v14i64_to_v56f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -30009,20 +30009,20 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v28, s30, 0
-; SI-NEXT:    v_writelane_b32 v28, s31, 1
-; SI-NEXT:    v_writelane_b32 v28, s34, 2
-; SI-NEXT:    v_writelane_b32 v28, s35, 3
-; SI-NEXT:    v_writelane_b32 v28, s36, 4
-; SI-NEXT:    v_writelane_b32 v28, s37, 5
-; SI-NEXT:    v_writelane_b32 v28, s38, 6
-; SI-NEXT:    v_writelane_b32 v28, s39, 7
-; SI-NEXT:    v_writelane_b32 v28, s48, 8
-; SI-NEXT:    v_writelane_b32 v28, s49, 9
-; SI-NEXT:    v_writelane_b32 v28, s50, 10
-; SI-NEXT:    v_writelane_b32 v28, s51, 11
+; SI-NEXT:    v_writelane_b32 v28, s34, 0
+; SI-NEXT:    v_writelane_b32 v28, s35, 1
+; SI-NEXT:    v_writelane_b32 v28, s36, 2
+; SI-NEXT:    v_writelane_b32 v28, s37, 3
+; SI-NEXT:    v_writelane_b32 v28, s38, 4
+; SI-NEXT:    v_writelane_b32 v28, s39, 5
+; SI-NEXT:    v_writelane_b32 v28, s48, 6
+; SI-NEXT:    v_writelane_b32 v28, s49, 7
+; SI-NEXT:    v_writelane_b32 v28, s50, 8
+; SI-NEXT:    v_writelane_b32 v28, s51, 9
+; SI-NEXT:    v_writelane_b32 v28, s52, 10
+; SI-NEXT:    v_writelane_b32 v28, s53, 11
 ; SI-NEXT:    v_readfirstlane_b32 s40, v14
-; SI-NEXT:    v_writelane_b32 v28, s52, 12
+; SI-NEXT:    v_writelane_b32 v28, s30, 12
 ; SI-NEXT:    v_readfirstlane_b32 s5, v13
 ; SI-NEXT:    v_readfirstlane_b32 s4, v12
 ; SI-NEXT:    v_readfirstlane_b32 s7, v11
@@ -30038,7 +30038,7 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i
 ; SI-NEXT:    v_readfirstlane_b32 s41, v1
 ; SI-NEXT:    s_cmp_lg_u32 s40, 0
 ; SI-NEXT:    v_readfirstlane_b32 s40, v0
-; SI-NEXT:    v_writelane_b32 v28, s53, 13
+; SI-NEXT:    v_writelane_b32 v28, s31, 13
 ; SI-NEXT:    s_cbranch_scc0 .LBB45_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s30, s5, 16
@@ -30212,6 +30212,7 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i
 ; SI-NEXT:    s_lshl_b32 s42, s30, 16
 ; SI-NEXT:    s_or_b32 s7, s7, s43
 ; SI-NEXT:    s_or_b32 s5, s5, s42
+; SI-NEXT:    v_readlane_b32 s30, v28, 12
 ; SI-NEXT:    v_mov_b32_e32 v0, s16
 ; SI-NEXT:    v_mov_b32_e32 v1, s17
 ; SI-NEXT:    v_mov_b32_e32 v2, s18
@@ -30240,20 +30241,19 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v25, s7
 ; SI-NEXT:    v_mov_b32_e32 v26, s4
 ; SI-NEXT:    v_mov_b32_e32 v27, s5
-; SI-NEXT:    v_readlane_b32 s53, v28, 13
-; SI-NEXT:    v_readlane_b32 s52, v28, 12
-; SI-NEXT:    v_readlane_b32 s51, v28, 11
-; SI-NEXT:    v_readlane_b32 s50, v28, 10
-; SI-NEXT:    v_readlane_b32 s49, v28, 9
-; SI-NEXT:    v_readlane_b32 s48, v28, 8
-; SI-NEXT:    v_readlane_b32 s39, v28, 7
-; SI-NEXT:    v_readlane_b32 s38, v28, 6
-; SI-NEXT:    v_readlane_b32 s37, v28, 5
-; SI-NEXT:    v_readlane_b32 s36, v28, 4
-; SI-NEXT:    v_readlane_b32 s35, v28, 3
-; SI-NEXT:    v_readlane_b32 s34, v28, 2
-; SI-NEXT:    v_readlane_b32 s31, v28, 1
-; SI-NEXT:    v_readlane_b32 s30, v28, 0
+; SI-NEXT:    v_readlane_b32 s31, v28, 13
+; SI-NEXT:    v_readlane_b32 s53, v28, 11
+; SI-NEXT:    v_readlane_b32 s52, v28, 10
+; SI-NEXT:    v_readlane_b32 s51, v28, 9
+; SI-NEXT:    v_readlane_b32 s50, v28, 8
+; SI-NEXT:    v_readlane_b32 s49, v28, 7
+; SI-NEXT:    v_readlane_b32 s48, v28, 6
+; SI-NEXT:    v_readlane_b32 s39, v28, 5
+; SI-NEXT:    v_readlane_b32 s38, v28, 4
+; SI-NEXT:    v_readlane_b32 s37, v28, 3
+; SI-NEXT:    v_readlane_b32 s36, v28, 2
+; SI-NEXT:    v_readlane_b32 s35, v28, 1
+; SI-NEXT:    v_readlane_b32 s34, v28, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -30296,10 +30296,10 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v28, s30, 0
-; VI-NEXT:    v_writelane_b32 v28, s31, 1
+; VI-NEXT:    v_writelane_b32 v28, s34, 0
+; VI-NEXT:    v_writelane_b32 v28, s35, 1
 ; VI-NEXT:    v_readfirstlane_b32 s4, v14
-; VI-NEXT:    v_writelane_b32 v28, s34, 2
+; VI-NEXT:    v_writelane_b32 v28, s30, 2
 ; VI-NEXT:    v_readfirstlane_b32 s6, v13
 ; VI-NEXT:    v_readfirstlane_b32 s7, v12
 ; VI-NEXT:    v_readfirstlane_b32 s8, v11
@@ -30315,7 +30315,7 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i
 ; VI-NEXT:    v_readfirstlane_b32 s42, v1
 ; VI-NEXT:    s_cmp_lg_u32 s4, 0
 ; VI-NEXT:    v_readfirstlane_b32 s43, v0
-; VI-NEXT:    v_writelane_b32 v28, s35, 3
+; VI-NEXT:    v_writelane_b32 v28, s31, 3
 ; VI-NEXT:    s_cbranch_scc0 .LBB45_4
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    s_lshr_b32 s44, s6, 16
@@ -30489,6 +30489,7 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i
 ; VI-NEXT:    s_and_b32 s6, 0xffff, s6
 ; VI-NEXT:    s_lshl_b32 s42, s44, 16
 ; VI-NEXT:    s_or_b32 s6, s6, s42
+; VI-NEXT:    v_readlane_b32 s30, v28, 2
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_mov_b32_e32 v2, s16
@@ -30517,10 +30518,9 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i
 ; VI-NEXT:    v_mov_b32_e32 v25, s8
 ; VI-NEXT:    v_mov_b32_e32 v26, s7
 ; VI-NEXT:    v_mov_b32_e32 v27, s6
-; VI-NEXT:    v_readlane_b32 s35, v28, 3
-; VI-NEXT:    v_readlane_b32 s34, v28, 2
-; VI-NEXT:    v_readlane_b32 s31, v28, 1
-; VI-NEXT:    v_readlane_b32 s30, v28, 0
+; VI-NEXT:    v_readlane_b32 s31, v28, 3
+; VI-NEXT:    v_readlane_b32 s35, v28, 1
+; VI-NEXT:    v_readlane_b32 s34, v28, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -30950,7 +30950,7 @@ end:
   ret <56 x half> %phi
 }
 
-define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) {
+define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v56f16_to_v14i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -32332,7 +32332,7 @@ end:
   ret <14 x i64> %phi
 }
 
-define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i32 inreg %b) {
+define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v56f16_to_v14i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -32340,37 +32340,37 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v32, s30, 0
-; SI-NEXT:    v_writelane_b32 v32, s31, 1
-; SI-NEXT:    v_writelane_b32 v32, s34, 2
-; SI-NEXT:    v_writelane_b32 v32, s35, 3
-; SI-NEXT:    v_writelane_b32 v32, s36, 4
-; SI-NEXT:    v_writelane_b32 v32, s37, 5
-; SI-NEXT:    v_writelane_b32 v32, s38, 6
-; SI-NEXT:    v_writelane_b32 v32, s39, 7
-; SI-NEXT:    v_writelane_b32 v32, s48, 8
-; SI-NEXT:    v_writelane_b32 v32, s49, 9
-; SI-NEXT:    v_writelane_b32 v32, s50, 10
-; SI-NEXT:    v_writelane_b32 v32, s51, 11
-; SI-NEXT:    v_writelane_b32 v32, s52, 12
-; SI-NEXT:    v_writelane_b32 v32, s53, 13
-; SI-NEXT:    v_writelane_b32 v32, s54, 14
-; SI-NEXT:    v_writelane_b32 v32, s55, 15
-; SI-NEXT:    v_writelane_b32 v32, s64, 16
-; SI-NEXT:    v_writelane_b32 v32, s65, 17
-; SI-NEXT:    v_writelane_b32 v32, s66, 18
-; SI-NEXT:    v_writelane_b32 v32, s67, 19
-; SI-NEXT:    v_writelane_b32 v32, s68, 20
-; SI-NEXT:    v_writelane_b32 v32, s69, 21
-; SI-NEXT:    v_writelane_b32 v32, s70, 22
-; SI-NEXT:    v_writelane_b32 v32, s71, 23
-; SI-NEXT:    v_writelane_b32 v32, s80, 24
-; SI-NEXT:    v_writelane_b32 v32, s81, 25
-; SI-NEXT:    v_writelane_b32 v32, s82, 26
-; SI-NEXT:    v_writelane_b32 v32, s83, 27
-; SI-NEXT:    v_writelane_b32 v32, s84, 28
-; SI-NEXT:    v_writelane_b32 v32, s85, 29
-; SI-NEXT:    v_writelane_b32 v32, s86, 30
+; SI-NEXT:    v_writelane_b32 v32, s34, 0
+; SI-NEXT:    v_writelane_b32 v32, s35, 1
+; SI-NEXT:    v_writelane_b32 v32, s36, 2
+; SI-NEXT:    v_writelane_b32 v32, s37, 3
+; SI-NEXT:    v_writelane_b32 v32, s38, 4
+; SI-NEXT:    v_writelane_b32 v32, s39, 5
+; SI-NEXT:    v_writelane_b32 v32, s48, 6
+; SI-NEXT:    v_writelane_b32 v32, s49, 7
+; SI-NEXT:    v_writelane_b32 v32, s50, 8
+; SI-NEXT:    v_writelane_b32 v32, s51, 9
+; SI-NEXT:    v_writelane_b32 v32, s52, 10
+; SI-NEXT:    v_writelane_b32 v32, s53, 11
+; SI-NEXT:    v_writelane_b32 v32, s54, 12
+; SI-NEXT:    v_writelane_b32 v32, s55, 13
+; SI-NEXT:    v_writelane_b32 v32, s64, 14
+; SI-NEXT:    v_writelane_b32 v32, s65, 15
+; SI-NEXT:    v_writelane_b32 v32, s66, 16
+; SI-NEXT:    v_writelane_b32 v32, s67, 17
+; SI-NEXT:    v_writelane_b32 v32, s68, 18
+; SI-NEXT:    v_writelane_b32 v32, s69, 19
+; SI-NEXT:    v_writelane_b32 v32, s70, 20
+; SI-NEXT:    v_writelane_b32 v32, s71, 21
+; SI-NEXT:    v_writelane_b32 v32, s80, 22
+; SI-NEXT:    v_writelane_b32 v32, s81, 23
+; SI-NEXT:    v_writelane_b32 v32, s82, 24
+; SI-NEXT:    v_writelane_b32 v32, s83, 25
+; SI-NEXT:    v_writelane_b32 v32, s84, 26
+; SI-NEXT:    v_writelane_b32 v32, s85, 27
+; SI-NEXT:    v_writelane_b32 v32, s86, 28
+; SI-NEXT:    v_writelane_b32 v32, s87, 29
+; SI-NEXT:    v_writelane_b32 v32, s30, 30
 ; SI-NEXT:    v_readfirstlane_b32 s6, v13
 ; SI-NEXT:    v_readfirstlane_b32 s8, v12
 ; SI-NEXT:    v_readfirstlane_b32 s10, v11
@@ -32385,7 +32385,7 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i
 ; SI-NEXT:    v_readfirstlane_b32 s92, v2
 ; SI-NEXT:    v_readfirstlane_b32 s95, v1
 ; SI-NEXT:    v_readfirstlane_b32 s34, v0
-; SI-NEXT:    v_writelane_b32 v32, s87, 31
+; SI-NEXT:    v_writelane_b32 v32, s31, 31
 ; SI-NEXT:    s_lshr_b32 s94, s29, 16
 ; SI-NEXT:    s_lshr_b32 s30, s28, 16
 ; SI-NEXT:    s_lshr_b32 s35, s27, 16
@@ -32766,38 +32766,38 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v30, s66
 ; SI-NEXT:    v_mov_b32_e32 v31, s67
 ; SI-NEXT:  .LBB47_5: ; %end
-; SI-NEXT:    v_readlane_b32 s87, v32, 31
-; SI-NEXT:    v_readlane_b32 s86, v32, 30
-; SI-NEXT:    v_readlane_b32 s85, v32, 29
-; SI-NEXT:    v_readlane_b32 s84, v32, 28
-; SI-NEXT:    v_readlane_b32 s83, v32, 27
-; SI-NEXT:    v_readlane_b32 s82, v32, 26
-; SI-NEXT:    v_readlane_b32 s81, v32, 25
-; SI-NEXT:    v_readlane_b32 s80, v32, 24
-; SI-NEXT:    v_readlane_b32 s71, v32, 23
-; SI-NEXT:    v_readlane_b32 s70, v32, 22
-; SI-NEXT:    v_readlane_b32 s69, v32, 21
-; SI-NEXT:    v_readlane_b32 s68, v32, 20
-; SI-NEXT:    v_readlane_b32 s67, v32, 19
-; SI-NEXT:    v_readlane_b32 s66, v32, 18
-; SI-NEXT:    v_readlane_b32 s65, v32, 17
-; SI-NEXT:    v_readlane_b32 s64, v32, 16
-; SI-NEXT:    v_readlane_b32 s55, v32, 15
-; SI-NEXT:    v_readlane_b32 s54, v32, 14
-; SI-NEXT:    v_readlane_b32 s53, v32, 13
-; SI-NEXT:    v_readlane_b32 s52, v32, 12
-; SI-NEXT:    v_readlane_b32 s51, v32, 11
-; SI-NEXT:    v_readlane_b32 s50, v32, 10
-; SI-NEXT:    v_readlane_b32 s49, v32, 9
-; SI-NEXT:    v_readlane_b32 s48, v32, 8
-; SI-NEXT:    v_readlane_b32 s39, v32, 7
-; SI-NEXT:    v_readlane_b32 s38, v32, 6
-; SI-NEXT:    v_readlane_b32 s37, v32, 5
-; SI-NEXT:    v_readlane_b32 s36, v32, 4
-; SI-NEXT:    v_readlane_b32 s35, v32, 3
-; SI-NEXT:    v_readlane_b32 s34, v32, 2
-; SI-NEXT:    v_readlane_b32 s31, v32, 1
-; SI-NEXT:    v_readlane_b32 s30, v32, 0
+; SI-NEXT:    v_readlane_b32 s30, v32, 30
+; SI-NEXT:    v_readlane_b32 s31, v32, 31
+; SI-NEXT:    v_readlane_b32 s87, v32, 29
+; SI-NEXT:    v_readlane_b32 s86, v32, 28
+; SI-NEXT:    v_readlane_b32 s85, v32, 27
+; SI-NEXT:    v_readlane_b32 s84, v32, 26
+; SI-NEXT:    v_readlane_b32 s83, v32, 25
+; SI-NEXT:    v_readlane_b32 s82, v32, 24
+; SI-NEXT:    v_readlane_b32 s81, v32, 23
+; SI-NEXT:    v_readlane_b32 s80, v32, 22
+; SI-NEXT:    v_readlane_b32 s71, v32, 21
+; SI-NEXT:    v_readlane_b32 s70, v32, 20
+; SI-NEXT:    v_readlane_b32 s69, v32, 19
+; SI-NEXT:    v_readlane_b32 s68, v32, 18
+; SI-NEXT:    v_readlane_b32 s67, v32, 17
+; SI-NEXT:    v_readlane_b32 s66, v32, 16
+; SI-NEXT:    v_readlane_b32 s65, v32, 15
+; SI-NEXT:    v_readlane_b32 s64, v32, 14
+; SI-NEXT:    v_readlane_b32 s55, v32, 13
+; SI-NEXT:    v_readlane_b32 s54, v32, 12
+; SI-NEXT:    v_readlane_b32 s53, v32, 11
+; SI-NEXT:    v_readlane_b32 s52, v32, 10
+; SI-NEXT:    v_readlane_b32 s51, v32, 9
+; SI-NEXT:    v_readlane_b32 s50, v32, 8
+; SI-NEXT:    v_readlane_b32 s49, v32, 7
+; SI-NEXT:    v_readlane_b32 s48, v32, 6
+; SI-NEXT:    v_readlane_b32 s39, v32, 5
+; SI-NEXT:    v_readlane_b32 s38, v32, 4
+; SI-NEXT:    v_readlane_b32 s37, v32, 3
+; SI-NEXT:    v_readlane_b32 s36, v32, 2
+; SI-NEXT:    v_readlane_b32 s35, v32, 1
+; SI-NEXT:    v_readlane_b32 s34, v32, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -32811,41 +32811,41 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v32, s30, 0
-; VI-NEXT:    v_writelane_b32 v32, s31, 1
-; VI-NEXT:    v_writelane_b32 v32, s34, 2
-; VI-NEXT:    v_writelane_b32 v32, s35, 3
-; VI-NEXT:    v_writelane_b32 v32, s36, 4
-; VI-NEXT:    v_writelane_b32 v32, s37, 5
-; VI-NEXT:    v_writelane_b32 v32, s38, 6
-; VI-NEXT:    v_writelane_b32 v32, s39, 7
-; VI-NEXT:    v_writelane_b32 v32, s48, 8
-; VI-NEXT:    v_writelane_b32 v32, s49, 9
-; VI-NEXT:    v_writelane_b32 v32, s50, 10
-; VI-NEXT:    v_writelane_b32 v32, s51, 11
-; VI-NEXT:    v_writelane_b32 v32, s52, 12
-; VI-NEXT:    v_writelane_b32 v32, s53, 13
-; VI-NEXT:    v_writelane_b32 v32, s54, 14
-; VI-NEXT:    v_writelane_b32 v32, s55, 15
-; VI-NEXT:    v_writelane_b32 v32, s64, 16
-; VI-NEXT:    v_writelane_b32 v32, s65, 17
-; VI-NEXT:    v_writelane_b32 v32, s66, 18
-; VI-NEXT:    v_writelane_b32 v32, s67, 19
-; VI-NEXT:    v_writelane_b32 v32, s68, 20
-; VI-NEXT:    v_writelane_b32 v32, s69, 21
-; VI-NEXT:    v_writelane_b32 v32, s70, 22
-; VI-NEXT:    v_writelane_b32 v32, s71, 23
-; VI-NEXT:    v_writelane_b32 v32, s80, 24
-; VI-NEXT:    v_writelane_b32 v32, s81, 25
-; VI-NEXT:    v_writelane_b32 v32, s82, 26
-; VI-NEXT:    v_writelane_b32 v32, s83, 27
-; VI-NEXT:    v_writelane_b32 v32, s84, 28
+; VI-NEXT:    v_writelane_b32 v32, s34, 0
+; VI-NEXT:    v_writelane_b32 v32, s35, 1
+; VI-NEXT:    v_writelane_b32 v32, s36, 2
+; VI-NEXT:    v_writelane_b32 v32, s37, 3
+; VI-NEXT:    v_writelane_b32 v32, s38, 4
+; VI-NEXT:    v_writelane_b32 v32, s39, 5
+; VI-NEXT:    v_writelane_b32 v32, s48, 6
+; VI-NEXT:    v_writelane_b32 v32, s49, 7
+; VI-NEXT:    v_writelane_b32 v32, s50, 8
+; VI-NEXT:    v_writelane_b32 v32, s51, 9
+; VI-NEXT:    v_writelane_b32 v32, s52, 10
+; VI-NEXT:    v_writelane_b32 v32, s53, 11
+; VI-NEXT:    v_writelane_b32 v32, s54, 12
+; VI-NEXT:    v_writelane_b32 v32, s55, 13
+; VI-NEXT:    v_writelane_b32 v32, s64, 14
+; VI-NEXT:    v_writelane_b32 v32, s65, 15
+; VI-NEXT:    v_writelane_b32 v32, s66, 16
+; VI-NEXT:    v_writelane_b32 v32, s67, 17
+; VI-NEXT:    v_writelane_b32 v32, s68, 18
+; VI-NEXT:    v_writelane_b32 v32, s69, 19
+; VI-NEXT:    v_writelane_b32 v32, s70, 20
+; VI-NEXT:    v_writelane_b32 v32, s71, 21
+; VI-NEXT:    v_writelane_b32 v32, s80, 22
+; VI-NEXT:    v_writelane_b32 v32, s81, 23
+; VI-NEXT:    v_writelane_b32 v32, s82, 24
+; VI-NEXT:    v_writelane_b32 v32, s83, 25
+; VI-NEXT:    v_writelane_b32 v32, s84, 26
+; VI-NEXT:    v_writelane_b32 v32, s85, 27
+; VI-NEXT:    v_writelane_b32 v32, s86, 28
 ; VI-NEXT:    v_readfirstlane_b32 s8, v12
-; VI-NEXT:    v_writelane_b32 v32, s85, 29
+; VI-NEXT:    v_writelane_b32 v32, s87, 29
 ; VI-NEXT:    s_lshr_b32 s15, s8, 16
 ; VI-NEXT:    v_readfirstlane_b32 s10, v11
 ; VI-NEXT:    ; implicit-def: $vgpr33 : SGPR spill to VGPR lane
-; VI-NEXT:    v_writelane_b32 v32, s86, 30
+; VI-NEXT:    v_writelane_b32 v32, s30, 30
 ; VI-NEXT:    v_readfirstlane_b32 s6, v13
 ; VI-NEXT:    s_lshr_b32 s61, s10, 16
 ; VI-NEXT:    v_readfirstlane_b32 s12, v10
@@ -32860,7 +32860,7 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i
 ; VI-NEXT:    v_readfirstlane_b32 s80, v1
 ; VI-NEXT:    v_readfirstlane_b32 s83, v0
 ; VI-NEXT:    v_writelane_b32 v33, s15, 0
-; VI-NEXT:    v_writelane_b32 v32, s87, 31
+; VI-NEXT:    v_writelane_b32 v32, s31, 31
 ; VI-NEXT:    s_lshr_b32 s56, s29, 16
 ; VI-NEXT:    s_lshr_b32 s75, s28, 16
 ; VI-NEXT:    s_lshr_b32 s90, s27, 16
@@ -33147,38 +33147,38 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i
 ; VI-NEXT:    v_mov_b32_e32 v30, s66
 ; VI-NEXT:    v_mov_b32_e32 v31, s67
 ; VI-NEXT:  .LBB47_5: ; %end
-; VI-NEXT:    v_readlane_b32 s87, v32, 31
-; VI-NEXT:    v_readlane_b32 s86, v32, 30
-; VI-NEXT:    v_readlane_b32 s85, v32, 29
-; VI-NEXT:    v_readlane_b32 s84, v32, 28
-; VI-NEXT:    v_readlane_b32 s83, v32, 27
-; VI-NEXT:    v_readlane_b32 s82, v32, 26
-; VI-NEXT:    v_readlane_b32 s81, v32, 25
-; VI-NEXT:    v_readlane_b32 s80, v32, 24
-; VI-NEXT:    v_readlane_b32 s71, v32, 23
-; VI-NEXT:    v_readlane_b32 s70, v32, 22
-; VI-NEXT:    v_readlane_b32 s69, v32, 21
-; VI-NEXT:    v_readlane_b32 s68, v32, 20
-; VI-NEXT:    v_readlane_b32 s67, v32, 19
-; VI-NEXT:    v_readlane_b32 s66, v32, 18
-; VI-NEXT:    v_readlane_b32 s65, v32, 17
-; VI-NEXT:    v_readlane_b32 s64, v32, 16
-; VI-NEXT:    v_readlane_b32 s55, v32, 15
-; VI-NEXT:    v_readlane_b32 s54, v32, 14
-; VI-NEXT:    v_readlane_b32 s53, v32, 13
-; VI-NEXT:    v_readlane_b32 s52, v32, 12
-; VI-NEXT:    v_readlane_b32 s51, v32, 11
-; VI-NEXT:    v_readlane_b32 s50, v32, 10
-; VI-NEXT:    v_readlane_b32 s49, v32, 9
-; VI-NEXT:    v_readlane_b32 s48, v32, 8
-; VI-NEXT:    v_readlane_b32 s39, v32, 7
-; VI-NEXT:    v_readlane_b32 s38, v32, 6
-; VI-NEXT:    v_readlane_b32 s37, v32, 5
-; VI-NEXT:    v_readlane_b32 s36, v32, 4
-; VI-NEXT:    v_readlane_b32 s35, v32, 3
-; VI-NEXT:    v_readlane_b32 s34, v32, 2
-; VI-NEXT:    v_readlane_b32 s31, v32, 1
-; VI-NEXT:    v_readlane_b32 s30, v32, 0
+; VI-NEXT:    v_readlane_b32 s30, v32, 30
+; VI-NEXT:    v_readlane_b32 s31, v32, 31
+; VI-NEXT:    v_readlane_b32 s87, v32, 29
+; VI-NEXT:    v_readlane_b32 s86, v32, 28
+; VI-NEXT:    v_readlane_b32 s85, v32, 27
+; VI-NEXT:    v_readlane_b32 s84, v32, 26
+; VI-NEXT:    v_readlane_b32 s83, v32, 25
+; VI-NEXT:    v_readlane_b32 s82, v32, 24
+; VI-NEXT:    v_readlane_b32 s81, v32, 23
+; VI-NEXT:    v_readlane_b32 s80, v32, 22
+; VI-NEXT:    v_readlane_b32 s71, v32, 21
+; VI-NEXT:    v_readlane_b32 s70, v32, 20
+; VI-NEXT:    v_readlane_b32 s69, v32, 19
+; VI-NEXT:    v_readlane_b32 s68, v32, 18
+; VI-NEXT:    v_readlane_b32 s67, v32, 17
+; VI-NEXT:    v_readlane_b32 s66, v32, 16
+; VI-NEXT:    v_readlane_b32 s65, v32, 15
+; VI-NEXT:    v_readlane_b32 s64, v32, 14
+; VI-NEXT:    v_readlane_b32 s55, v32, 13
+; VI-NEXT:    v_readlane_b32 s54, v32, 12
+; VI-NEXT:    v_readlane_b32 s53, v32, 11
+; VI-NEXT:    v_readlane_b32 s52, v32, 10
+; VI-NEXT:    v_readlane_b32 s51, v32, 9
+; VI-NEXT:    v_readlane_b32 s50, v32, 8
+; VI-NEXT:    v_readlane_b32 s49, v32, 7
+; VI-NEXT:    v_readlane_b32 s48, v32, 6
+; VI-NEXT:    v_readlane_b32 s39, v32, 5
+; VI-NEXT:    v_readlane_b32 s38, v32, 4
+; VI-NEXT:    v_readlane_b32 s37, v32, 3
+; VI-NEXT:    v_readlane_b32 s36, v32, 2
+; VI-NEXT:    v_readlane_b32 s35, v32, 1
+; VI-NEXT:    v_readlane_b32 s34, v32, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -33507,7 +33507,7 @@ end:
   ret <14 x i64> %phi
 }
 
-define <56 x i16> @bitcast_v14f64_to_v56i16(<14 x double> %a, i32 %b) {
+define <56 x i16> @bitcast_v14f64_to_v56i16(<14 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v14f64_to_v56i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -34273,7 +34273,7 @@ end:
   ret <56 x i16> %phi
 }
 
-define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, i32 inreg %b) {
+define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v14f64_to_v56i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -34281,20 +34281,20 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a,
 ; SI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v58, s30, 0
-; SI-NEXT:    v_writelane_b32 v58, s31, 1
-; SI-NEXT:    v_writelane_b32 v58, s34, 2
-; SI-NEXT:    v_writelane_b32 v58, s35, 3
-; SI-NEXT:    v_writelane_b32 v58, s36, 4
-; SI-NEXT:    v_writelane_b32 v58, s37, 5
-; SI-NEXT:    v_writelane_b32 v58, s38, 6
-; SI-NEXT:    v_writelane_b32 v58, s39, 7
-; SI-NEXT:    v_writelane_b32 v58, s48, 8
-; SI-NEXT:    v_writelane_b32 v58, s49, 9
-; SI-NEXT:    v_writelane_b32 v58, s50, 10
-; SI-NEXT:    v_writelane_b32 v58, s51, 11
+; SI-NEXT:    v_writelane_b32 v58, s34, 0
+; SI-NEXT:    v_writelane_b32 v58, s35, 1
+; SI-NEXT:    v_writelane_b32 v58, s36, 2
+; SI-NEXT:    v_writelane_b32 v58, s37, 3
+; SI-NEXT:    v_writelane_b32 v58, s38, 4
+; SI-NEXT:    v_writelane_b32 v58, s39, 5
+; SI-NEXT:    v_writelane_b32 v58, s48, 6
+; SI-NEXT:    v_writelane_b32 v58, s49, 7
+; SI-NEXT:    v_writelane_b32 v58, s50, 8
+; SI-NEXT:    v_writelane_b32 v58, s51, 9
+; SI-NEXT:    v_writelane_b32 v58, s52, 10
+; SI-NEXT:    v_writelane_b32 v58, s53, 11
 ; SI-NEXT:    v_readfirstlane_b32 s4, v14
-; SI-NEXT:    v_writelane_b32 v58, s52, 12
+; SI-NEXT:    v_writelane_b32 v58, s30, 12
 ; SI-NEXT:    v_readfirstlane_b32 s41, v13
 ; SI-NEXT:    v_readfirstlane_b32 s40, v12
 ; SI-NEXT:    v_readfirstlane_b32 s15, v11
@@ -34320,7 +34320,7 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a,
 ; SI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 ; 4-byte Folded Spill
-; SI-NEXT:    v_writelane_b32 v58, s53, 13
+; SI-NEXT:    v_writelane_b32 v58, s31, 13
 ; SI-NEXT:    s_cbranch_scc0 .LBB49_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s53, s41, 16
@@ -34580,6 +34580,7 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a,
 ; SI-NEXT:    v_or_b32_e32 v26, v26, v28
 ; SI-NEXT:    v_and_b32_e32 v27, 0xffff, v27
 ; SI-NEXT:    v_lshlrev_b32_e32 v28, 16, v52
+; SI-NEXT:    v_readlane_b32 s30, v58, 12
 ; SI-NEXT:    v_or_b32_e32 v5, v5, v39
 ; SI-NEXT:    v_or_b32_e32 v7, v7, v38
 ; SI-NEXT:    v_or_b32_e32 v9, v9, v37
@@ -34592,20 +34593,19 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a,
 ; SI-NEXT:    v_or_b32_e32 v23, v23, v30
 ; SI-NEXT:    v_or_b32_e32 v25, v25, v29
 ; SI-NEXT:    v_or_b32_e32 v27, v27, v28
-; SI-NEXT:    v_readlane_b32 s53, v58, 13
-; SI-NEXT:    v_readlane_b32 s52, v58, 12
-; SI-NEXT:    v_readlane_b32 s51, v58, 11
-; SI-NEXT:    v_readlane_b32 s50, v58, 10
-; SI-NEXT:    v_readlane_b32 s49, v58, 9
-; SI-NEXT:    v_readlane_b32 s48, v58, 8
-; SI-NEXT:    v_readlane_b32 s39, v58, 7
-; SI-NEXT:    v_readlane_b32 s38, v58, 6
-; SI-NEXT:    v_readlane_b32 s37, v58, 5
-; SI-NEXT:    v_readlane_b32 s36, v58, 4
-; SI-NEXT:    v_readlane_b32 s35, v58, 3
-; SI-NEXT:    v_readlane_b32 s34, v58, 2
-; SI-NEXT:    v_readlane_b32 s31, v58, 1
-; SI-NEXT:    v_readlane_b32 s30, v58, 0
+; SI-NEXT:    v_readlane_b32 s31, v58, 13
+; SI-NEXT:    v_readlane_b32 s53, v58, 11
+; SI-NEXT:    v_readlane_b32 s52, v58, 10
+; SI-NEXT:    v_readlane_b32 s51, v58, 9
+; SI-NEXT:    v_readlane_b32 s50, v58, 8
+; SI-NEXT:    v_readlane_b32 s49, v58, 7
+; SI-NEXT:    v_readlane_b32 s48, v58, 6
+; SI-NEXT:    v_readlane_b32 s39, v58, 5
+; SI-NEXT:    v_readlane_b32 s38, v58, 4
+; SI-NEXT:    v_readlane_b32 s37, v58, 3
+; SI-NEXT:    v_readlane_b32 s36, v58, 2
+; SI-NEXT:    v_readlane_b32 s35, v58, 1
+; SI-NEXT:    v_readlane_b32 s34, v58, 0
 ; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -34618,10 +34618,10 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a,
 ; VI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v56, s30, 0
-; VI-NEXT:    v_writelane_b32 v56, s31, 1
+; VI-NEXT:    v_writelane_b32 v56, s34, 0
+; VI-NEXT:    v_writelane_b32 v56, s35, 1
 ; VI-NEXT:    v_readfirstlane_b32 s4, v14
-; VI-NEXT:    v_writelane_b32 v56, s34, 2
+; VI-NEXT:    v_writelane_b32 v56, s30, 2
 ; VI-NEXT:    v_readfirstlane_b32 s9, v13
 ; VI-NEXT:    v_readfirstlane_b32 s8, v12
 ; VI-NEXT:    v_readfirstlane_b32 s11, v11
@@ -34645,7 +34645,7 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a,
 ; VI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill
-; VI-NEXT:    v_writelane_b32 v56, s35, 3
+; VI-NEXT:    v_writelane_b32 v56, s31, 3
 ; VI-NEXT:    s_cbranch_scc0 .LBB49_3
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    s_lshr_b32 s44, s9, 16
@@ -34862,6 +34862,7 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a,
 ; VI-NEXT:    v_lshlrev_b32_e32 v29, 16, v33
 ; VI-NEXT:    v_or_b32_sdwa v26, v26, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_lshlrev_b32_e32 v28, 16, v31
+; VI-NEXT:    v_readlane_b32 s30, v56, 2
 ; VI-NEXT:    v_or_b32_sdwa v7, v7, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v9, v9, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v11, v11, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -34873,10 +34874,9 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a,
 ; VI-NEXT:    v_or_b32_sdwa v23, v23, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v25, v25, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_readlane_b32 s35, v56, 3
-; VI-NEXT:    v_readlane_b32 s34, v56, 2
-; VI-NEXT:    v_readlane_b32 s31, v56, 1
-; VI-NEXT:    v_readlane_b32 s30, v56, 0
+; VI-NEXT:    v_readlane_b32 s31, v56, 3
+; VI-NEXT:    v_readlane_b32 s35, v56, 1
+; VI-NEXT:    v_readlane_b32 s34, v56, 0
 ; VI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -35550,7 +35550,7 @@ end:
   ret <56 x i16> %phi
 }
 
-define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) {
+define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v56i16_to_v14f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -36795,7 +36795,7 @@ end:
   ret <14 x double> %phi
 }
 
-define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a, i32 inreg %b) {
+define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v56i16_to_v14f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -36803,37 +36803,38 @@ define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a,
 ; SI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v28, s30, 0
-; SI-NEXT:    v_writelane_b32 v28, s31, 1
-; SI-NEXT:    v_writelane_b32 v28, s34, 2
-; SI-NEXT:    v_writelane_b32 v28, s35, 3
-; SI-NEXT:    v_writelane_b32 v28, s36, 4
-; SI-NEXT:    v_writelane_b32 v28, s37, 5
-; SI-NEXT:    v_writelane_b32 v28, s38, 6
-; SI-NEXT:    v_writelane_b32 v28, s39, 7
-; SI-NEXT:    v_writelane_b32 v28, s48, 8
-; SI-NEXT:    v_writelane_b32 v28, s49, 9
-; SI-NEXT:    v_writelane_b32 v28, s50, 10
-; SI-NEXT:    v_writelane_b32 v28, s51, 11
-; SI-NEXT:    v_writelane_b32 v28, s52, 12
-; SI-NEXT:    v_writelane_b32 v28, s53, 13
-; SI-NEXT:    v_writelane_b32 v28, s54, 14
-; SI-NEXT:    v_writelane_b32 v28, s55, 15
-; SI-NEXT:    v_writelane_b32 v28, s64, 16
-; SI-NEXT:    v_writelane_b32 v28, s65, 17
-; SI-NEXT:    v_writelane_b32 v28, s66, 18
-; SI-NEXT:    v_writelane_b32 v28, s67, 19
-; SI-NEXT:    v_writelane_b32 v28, s68, 20
-; SI-NEXT:    v_writelane_b32 v28, s69, 21
-; SI-NEXT:    v_writelane_b32 v28, s70, 22
-; SI-NEXT:    v_writelane_b32 v28, s71, 23
-; SI-NEXT:    v_writelane_b32 v28, s80, 24
-; SI-NEXT:    v_writelane_b32 v28, s81, 25
-; SI-NEXT:    v_writelane_b32 v28, s82, 26
-; SI-NEXT:    v_writelane_b32 v28, s83, 27
-; SI-NEXT:    v_writelane_b32 v28, s84, 28
-; SI-NEXT:    v_writelane_b32 v28, s85, 29
-; SI-NEXT:    v_writelane_b32 v28, s86, 30
+; SI-NEXT:    v_writelane_b32 v28, s34, 0
+; SI-NEXT:    v_writelane_b32 v28, s35, 1
+; SI-NEXT:    v_writelane_b32 v28, s36, 2
+; SI-NEXT:    v_writelane_b32 v28, s37, 3
+; SI-NEXT:    v_writelane_b32 v28, s38, 4
+; SI-NEXT:    v_writelane_b32 v28, s39, 5
+; SI-NEXT:    v_writelane_b32 v28, s48, 6
+; SI-NEXT:    v_writelane_b32 v28, s49, 7
+; SI-NEXT:    v_writelane_b32 v28, s50, 8
+; SI-NEXT:    v_writelane_b32 v28, s51, 9
+; SI-NEXT:    v_writelane_b32 v28, s52, 10
+; SI-NEXT:    v_writelane_b32 v28, s53, 11
+; SI-NEXT:    v_writelane_b32 v28, s54, 12
+; SI-NEXT:    v_writelane_b32 v28, s55, 13
+; SI-NEXT:    v_writelane_b32 v28, s64, 14
+; SI-NEXT:    v_writelane_b32 v28, s65, 15
+; SI-NEXT:    v_writelane_b32 v28, s66, 16
+; SI-NEXT:    v_writelane_b32 v28, s67, 17
+; SI-NEXT:    v_writelane_b32 v28, s68, 18
+; SI-NEXT:    v_writelane_b32 v28, s69, 19
+; SI-NEXT:    v_writelane_b32 v28, s70, 20
+; SI-NEXT:    v_writelane_b32 v28, s71, 21
+; SI-NEXT:    v_writelane_b32 v28, s80, 22
+; SI-NEXT:    v_writelane_b32 v28, s81, 23
+; SI-NEXT:    v_writelane_b32 v28, s82, 24
+; SI-NEXT:    v_writelane_b32 v28, s83, 25
+; SI-NEXT:    v_writelane_b32 v28, s84, 26
+; SI-NEXT:    v_writelane_b32 v28, s85, 27
+; SI-NEXT:    v_writelane_b32 v28, s86, 28
+; SI-NEXT:    v_writelane_b32 v28, s87, 29
+; SI-NEXT:    v_writelane_b32 v28, s30, 30
+; SI-NEXT:    v_writelane_b32 v28, s31, 31
 ; SI-NEXT:    v_readfirstlane_b32 s7, v13
 ; SI-NEXT:    v_readfirstlane_b32 s9, v12
 ; SI-NEXT:    v_readfirstlane_b32 s11, v11
@@ -36848,7 +36849,6 @@ define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a,
 ; SI-NEXT:    v_readfirstlane_b32 s31, v2
 ; SI-NEXT:    v_readfirstlane_b32 s69, v1
 ; SI-NEXT:    v_readfirstlane_b32 s80, v0
-; SI-NEXT:    v_writelane_b32 v28, s87, 31
 ; SI-NEXT:    s_lshr_b32 s78, s29, 16
 ; SI-NEXT:    s_lshr_b32 s89, s28, 16
 ; SI-NEXT:    s_lshr_b32 s92, s27, 16
@@ -37108,6 +37108,7 @@ define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a,
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_add_i32 s63, s4, 0x30000
 ; SI-NEXT:  .LBB51_3: ; %end
+; SI-NEXT:    v_readlane_b32 s30, v28, 30
 ; SI-NEXT:    v_mov_b32_e32 v0, s36
 ; SI-NEXT:    v_mov_b32_e32 v1, s37
 ; SI-NEXT:    v_mov_b32_e32 v2, s38
@@ -37136,38 +37137,37 @@ define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a,
 ; SI-NEXT:    v_mov_b32_e32 v25, s61
 ; SI-NEXT:    v_mov_b32_e32 v26, s62
 ; SI-NEXT:    v_mov_b32_e32 v27, s63
-; SI-NEXT:    v_readlane_b32 s87, v28, 31
-; SI-NEXT:    v_readlane_b32 s86, v28, 30
-; SI-NEXT:    v_readlane_b32 s85, v28, 29
-; SI-NEXT:    v_readlane_b32 s84, v28, 28
-; SI-NEXT:    v_readlane_b32 s83, v28, 27
-; SI-NEXT:    v_readlane_b32 s82, v28, 26
-; SI-NEXT:    v_readlane_b32 s81, v28, 25
-; SI-NEXT:    v_readlane_b32 s80, v28, 24
-; SI-NEXT:    v_readlane_b32 s71, v28, 23
-; SI-NEXT:    v_readlane_b32 s70, v28, 22
-; SI-NEXT:    v_readlane_b32 s69, v28, 21
-; SI-NEXT:    v_readlane_b32 s68, v28, 20
-; SI-NEXT:    v_readlane_b32 s67, v28, 19
-; SI-NEXT:    v_readlane_b32 s66, v28, 18
-; SI-NEXT:    v_readlane_b32 s65, v28, 17
-; SI-NEXT:    v_readlane_b32 s64, v28, 16
-; SI-NEXT:    v_readlane_b32 s55, v28, 15
-; SI-NEXT:    v_readlane_b32 s54, v28, 14
-; SI-NEXT:    v_readlane_b32 s53, v28, 13
-; SI-NEXT:    v_readlane_b32 s52, v28, 12
-; SI-NEXT:    v_readlane_b32 s51, v28, 11
-; SI-NEXT:    v_readlane_b32 s50, v28, 10
-; SI-NEXT:    v_readlane_b32 s49, v28, 9
-; SI-NEXT:    v_readlane_b32 s48, v28, 8
-; SI-NEXT:    v_readlane_b32 s39, v28, 7
-; SI-NEXT:    v_readlane_b32 s38, v28, 6
-; SI-NEXT:    v_readlane_b32 s37, v28, 5
-; SI-NEXT:    v_readlane_b32 s36, v28, 4
-; SI-NEXT:    v_readlane_b32 s35, v28, 3
-; SI-NEXT:    v_readlane_b32 s34, v28, 2
-; SI-NEXT:    v_readlane_b32 s31, v28, 1
-; SI-NEXT:    v_readlane_b32 s30, v28, 0
+; SI-NEXT:    v_readlane_b32 s31, v28, 31
+; SI-NEXT:    v_readlane_b32 s87, v28, 29
+; SI-NEXT:    v_readlane_b32 s86, v28, 28
+; SI-NEXT:    v_readlane_b32 s85, v28, 27
+; SI-NEXT:    v_readlane_b32 s84, v28, 26
+; SI-NEXT:    v_readlane_b32 s83, v28, 25
+; SI-NEXT:    v_readlane_b32 s82, v28, 24
+; SI-NEXT:    v_readlane_b32 s81, v28, 23
+; SI-NEXT:    v_readlane_b32 s80, v28, 22
+; SI-NEXT:    v_readlane_b32 s71, v28, 21
+; SI-NEXT:    v_readlane_b32 s70, v28, 20
+; SI-NEXT:    v_readlane_b32 s69, v28, 19
+; SI-NEXT:    v_readlane_b32 s68, v28, 18
+; SI-NEXT:    v_readlane_b32 s67, v28, 17
+; SI-NEXT:    v_readlane_b32 s66, v28, 16
+; SI-NEXT:    v_readlane_b32 s65, v28, 15
+; SI-NEXT:    v_readlane_b32 s64, v28, 14
+; SI-NEXT:    v_readlane_b32 s55, v28, 13
+; SI-NEXT:    v_readlane_b32 s54, v28, 12
+; SI-NEXT:    v_readlane_b32 s53, v28, 11
+; SI-NEXT:    v_readlane_b32 s52, v28, 10
+; SI-NEXT:    v_readlane_b32 s51, v28, 9
+; SI-NEXT:    v_readlane_b32 s50, v28, 8
+; SI-NEXT:    v_readlane_b32 s49, v28, 7
+; SI-NEXT:    v_readlane_b32 s48, v28, 6
+; SI-NEXT:    v_readlane_b32 s39, v28, 5
+; SI-NEXT:    v_readlane_b32 s38, v28, 4
+; SI-NEXT:    v_readlane_b32 s37, v28, 3
+; SI-NEXT:    v_readlane_b32 s36, v28, 2
+; SI-NEXT:    v_readlane_b32 s35, v28, 1
+; SI-NEXT:    v_readlane_b32 s34, v28, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -37184,38 +37184,37 @@ define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a,
 ; VI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v28, s30, 0
-; VI-NEXT:    v_writelane_b32 v28, s31, 1
-; VI-NEXT:    v_writelane_b32 v28, s34, 2
-; VI-NEXT:    v_writelane_b32 v28, s35, 3
-; VI-NEXT:    v_writelane_b32 v28, s36, 4
-; VI-NEXT:    v_writelane_b32 v28, s37, 5
-; VI-NEXT:    v_writelane_b32 v28, s38, 6
-; VI-NEXT:    v_writelane_b32 v28, s39, 7
-; VI-NEXT:    v_writelane_b32 v28, s48, 8
-; VI-NEXT:    v_writelane_b32 v28, s49, 9
-; VI-NEXT:    v_writelane_b32 v28, s50, 10
-; VI-NEXT:    v_writelane_b32 v28, s51, 11
-; VI-NEXT:    v_writelane_b32 v28, s52, 12
-; VI-NEXT:    v_writelane_b32 v28, s53, 13
-; VI-NEXT:    v_writelane_b32 v28, s54, 14
-; VI-NEXT:    v_writelane_b32 v28, s55, 15
-; VI-NEXT:    v_writelane_b32 v28, s64, 16
-; VI-NEXT:    v_writelane_b32 v28, s65, 17
-; VI-NEXT:    v_writelane_b32 v28, s66, 18
-; VI-NEXT:    v_writelane_b32 v28, s67, 19
-; VI-NEXT:    v_writelane_b32 v28, s68, 20
-; VI-NEXT:    v_writelane_b32 v28, s69, 21
-; VI-NEXT:    v_writelane_b32 v28, s70, 22
-; VI-NEXT:    v_writelane_b32 v28, s71, 23
-; VI-NEXT:    v_writelane_b32 v28, s80, 24
-; VI-NEXT:    v_writelane_b32 v28, s81, 25
-; VI-NEXT:    v_writelane_b32 v28, s82, 26
-; VI-NEXT:    v_writelane_b32 v28, s83, 27
-; VI-NEXT:    v_writelane_b32 v28, s84, 28
-; VI-NEXT:    v_writelane_b32 v28, s85, 29
-; VI-NEXT:    v_writelane_b32 v28, s86, 30
-; VI-NEXT:    v_writelane_b32 v28, s87, 31
+; VI-NEXT:    v_writelane_b32 v28, s34, 0
+; VI-NEXT:    v_writelane_b32 v28, s35, 1
+; VI-NEXT:    v_writelane_b32 v28, s36, 2
+; VI-NEXT:    v_writelane_b32 v28, s37, 3
+; VI-NEXT:    v_writelane_b32 v28, s38, 4
+; VI-NEXT:    v_writelane_b32 v28, s39, 5
+; VI-NEXT:    v_writelane_b32 v28, s48, 6
+; VI-NEXT:    v_writelane_b32 v28, s49, 7
+; VI-NEXT:    v_writelane_b32 v28, s50, 8
+; VI-NEXT:    v_writelane_b32 v28, s51, 9
+; VI-NEXT:    v_writelane_b32 v28, s52, 10
+; VI-NEXT:    v_writelane_b32 v28, s53, 11
+; VI-NEXT:    v_writelane_b32 v28, s54, 12
+; VI-NEXT:    v_writelane_b32 v28, s55, 13
+; VI-NEXT:    v_writelane_b32 v28, s64, 14
+; VI-NEXT:    v_writelane_b32 v28, s65, 15
+; VI-NEXT:    v_writelane_b32 v28, s66, 16
+; VI-NEXT:    v_writelane_b32 v28, s67, 17
+; VI-NEXT:    v_writelane_b32 v28, s68, 18
+; VI-NEXT:    v_writelane_b32 v28, s69, 19
+; VI-NEXT:    v_writelane_b32 v28, s70, 20
+; VI-NEXT:    v_writelane_b32 v28, s71, 21
+; VI-NEXT:    v_writelane_b32 v28, s80, 22
+; VI-NEXT:    v_writelane_b32 v28, s81, 23
+; VI-NEXT:    v_writelane_b32 v28, s82, 24
+; VI-NEXT:    v_writelane_b32 v28, s83, 25
+; VI-NEXT:    v_writelane_b32 v28, s84, 26
+; VI-NEXT:    v_writelane_b32 v28, s85, 27
+; VI-NEXT:    v_writelane_b32 v28, s86, 28
+; VI-NEXT:    v_writelane_b32 v28, s87, 29
+; VI-NEXT:    v_writelane_b32 v28, s30, 30
 ; VI-NEXT:    v_readfirstlane_b32 s86, v13
 ; VI-NEXT:    v_readfirstlane_b32 s6, v12
 ; VI-NEXT:    v_readfirstlane_b32 s9, v11
@@ -37230,6 +37229,7 @@ define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a,
 ; VI-NEXT:    v_readfirstlane_b32 s69, v2
 ; VI-NEXT:    v_readfirstlane_b32 s81, v1
 ; VI-NEXT:    v_readfirstlane_b32 s84, v0
+; VI-NEXT:    v_writelane_b32 v28, s31, 31
 ; VI-NEXT:    s_lshr_b32 s79, s29, 16
 ; VI-NEXT:    s_lshr_b32 s90, s28, 16
 ; VI-NEXT:    s_lshr_b32 s31, s27, 16
@@ -37489,6 +37489,7 @@ define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a,
 ; VI-NEXT:    s_or_b32 s4, s5, s4
 ; VI-NEXT:    s_add_i32 s63, s4, 0x30000
 ; VI-NEXT:  .LBB51_3: ; %end
+; VI-NEXT:    v_readlane_b32 s30, v28, 30
 ; VI-NEXT:    v_mov_b32_e32 v0, s36
 ; VI-NEXT:    v_mov_b32_e32 v1, s37
 ; VI-NEXT:    v_mov_b32_e32 v2, s38
@@ -37517,38 +37518,37 @@ define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a,
 ; VI-NEXT:    v_mov_b32_e32 v25, s61
 ; VI-NEXT:    v_mov_b32_e32 v26, s62
 ; VI-NEXT:    v_mov_b32_e32 v27, s63
-; VI-NEXT:    v_readlane_b32 s87, v28, 31
-; VI-NEXT:    v_readlane_b32 s86, v28, 30
-; VI-NEXT:    v_readlane_b32 s85, v28, 29
-; VI-NEXT:    v_readlane_b32 s84, v28, 28
-; VI-NEXT:    v_readlane_b32 s83, v28, 27
-; VI-NEXT:    v_readlane_b32 s82, v28, 26
-; VI-NEXT:    v_readlane_b32 s81, v28, 25
-; VI-NEXT:    v_readlane_b32 s80, v28, 24
-; VI-NEXT:    v_readlane_b32 s71, v28, 23
-; VI-NEXT:    v_readlane_b32 s70, v28, 22
-; VI-NEXT:    v_readlane_b32 s69, v28, 21
-; VI-NEXT:    v_readlane_b32 s68, v28, 20
-; VI-NEXT:    v_readlane_b32 s67, v28, 19
-; VI-NEXT:    v_readlane_b32 s66, v28, 18
-; VI-NEXT:    v_readlane_b32 s65, v28, 17
-; VI-NEXT:    v_readlane_b32 s64, v28, 16
-; VI-NEXT:    v_readlane_b32 s55, v28, 15
-; VI-NEXT:    v_readlane_b32 s54, v28, 14
-; VI-NEXT:    v_readlane_b32 s53, v28, 13
-; VI-NEXT:    v_readlane_b32 s52, v28, 12
-; VI-NEXT:    v_readlane_b32 s51, v28, 11
-; VI-NEXT:    v_readlane_b32 s50, v28, 10
-; VI-NEXT:    v_readlane_b32 s49, v28, 9
-; VI-NEXT:    v_readlane_b32 s48, v28, 8
-; VI-NEXT:    v_readlane_b32 s39, v28, 7
-; VI-NEXT:    v_readlane_b32 s38, v28, 6
-; VI-NEXT:    v_readlane_b32 s37, v28, 5
-; VI-NEXT:    v_readlane_b32 s36, v28, 4
-; VI-NEXT:    v_readlane_b32 s35, v28, 3
-; VI-NEXT:    v_readlane_b32 s34, v28, 2
-; VI-NEXT:    v_readlane_b32 s31, v28, 1
-; VI-NEXT:    v_readlane_b32 s30, v28, 0
+; VI-NEXT:    v_readlane_b32 s31, v28, 31
+; VI-NEXT:    v_readlane_b32 s87, v28, 29
+; VI-NEXT:    v_readlane_b32 s86, v28, 28
+; VI-NEXT:    v_readlane_b32 s85, v28, 27
+; VI-NEXT:    v_readlane_b32 s84, v28, 26
+; VI-NEXT:    v_readlane_b32 s83, v28, 25
+; VI-NEXT:    v_readlane_b32 s82, v28, 24
+; VI-NEXT:    v_readlane_b32 s81, v28, 23
+; VI-NEXT:    v_readlane_b32 s80, v28, 22
+; VI-NEXT:    v_readlane_b32 s71, v28, 21
+; VI-NEXT:    v_readlane_b32 s70, v28, 20
+; VI-NEXT:    v_readlane_b32 s69, v28, 19
+; VI-NEXT:    v_readlane_b32 s68, v28, 18
+; VI-NEXT:    v_readlane_b32 s67, v28, 17
+; VI-NEXT:    v_readlane_b32 s66, v28, 16
+; VI-NEXT:    v_readlane_b32 s65, v28, 15
+; VI-NEXT:    v_readlane_b32 s64, v28, 14
+; VI-NEXT:    v_readlane_b32 s55, v28, 13
+; VI-NEXT:    v_readlane_b32 s54, v28, 12
+; VI-NEXT:    v_readlane_b32 s53, v28, 11
+; VI-NEXT:    v_readlane_b32 s52, v28, 10
+; VI-NEXT:    v_readlane_b32 s51, v28, 9
+; VI-NEXT:    v_readlane_b32 s50, v28, 8
+; VI-NEXT:    v_readlane_b32 s49, v28, 7
+; VI-NEXT:    v_readlane_b32 s48, v28, 6
+; VI-NEXT:    v_readlane_b32 s39, v28, 5
+; VI-NEXT:    v_readlane_b32 s38, v28, 4
+; VI-NEXT:    v_readlane_b32 s37, v28, 3
+; VI-NEXT:    v_readlane_b32 s36, v28, 2
+; VI-NEXT:    v_readlane_b32 s35, v28, 1
+; VI-NEXT:    v_readlane_b32 s34, v28, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -37966,7 +37966,7 @@ end:
   ret <14 x double> %phi
 }
 
-define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) {
+define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v14f64_to_v56f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -38732,7 +38732,7 @@ end:
   ret <56 x half> %phi
 }
 
-define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a, i32 inreg %b) {
+define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v14f64_to_v56f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -38740,20 +38740,20 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a
 ; SI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v58, s30, 0
-; SI-NEXT:    v_writelane_b32 v58, s31, 1
-; SI-NEXT:    v_writelane_b32 v58, s34, 2
-; SI-NEXT:    v_writelane_b32 v58, s35, 3
-; SI-NEXT:    v_writelane_b32 v58, s36, 4
-; SI-NEXT:    v_writelane_b32 v58, s37, 5
-; SI-NEXT:    v_writelane_b32 v58, s38, 6
-; SI-NEXT:    v_writelane_b32 v58, s39, 7
-; SI-NEXT:    v_writelane_b32 v58, s48, 8
-; SI-NEXT:    v_writelane_b32 v58, s49, 9
-; SI-NEXT:    v_writelane_b32 v58, s50, 10
-; SI-NEXT:    v_writelane_b32 v58, s51, 11
+; SI-NEXT:    v_writelane_b32 v58, s34, 0
+; SI-NEXT:    v_writelane_b32 v58, s35, 1
+; SI-NEXT:    v_writelane_b32 v58, s36, 2
+; SI-NEXT:    v_writelane_b32 v58, s37, 3
+; SI-NEXT:    v_writelane_b32 v58, s38, 4
+; SI-NEXT:    v_writelane_b32 v58, s39, 5
+; SI-NEXT:    v_writelane_b32 v58, s48, 6
+; SI-NEXT:    v_writelane_b32 v58, s49, 7
+; SI-NEXT:    v_writelane_b32 v58, s50, 8
+; SI-NEXT:    v_writelane_b32 v58, s51, 9
+; SI-NEXT:    v_writelane_b32 v58, s52, 10
+; SI-NEXT:    v_writelane_b32 v58, s53, 11
 ; SI-NEXT:    v_readfirstlane_b32 s4, v14
-; SI-NEXT:    v_writelane_b32 v58, s52, 12
+; SI-NEXT:    v_writelane_b32 v58, s30, 12
 ; SI-NEXT:    v_readfirstlane_b32 s41, v13
 ; SI-NEXT:    v_readfirstlane_b32 s40, v12
 ; SI-NEXT:    v_readfirstlane_b32 s15, v11
@@ -38779,7 +38779,7 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a
 ; SI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 ; 4-byte Folded Spill
-; SI-NEXT:    v_writelane_b32 v58, s53, 13
+; SI-NEXT:    v_writelane_b32 v58, s31, 13
 ; SI-NEXT:    s_cbranch_scc0 .LBB53_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s53, s41, 16
@@ -39039,6 +39039,7 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a
 ; SI-NEXT:    v_or_b32_e32 v26, v26, v28
 ; SI-NEXT:    v_and_b32_e32 v27, 0xffff, v27
 ; SI-NEXT:    v_lshlrev_b32_e32 v28, 16, v52
+; SI-NEXT:    v_readlane_b32 s30, v58, 12
 ; SI-NEXT:    v_or_b32_e32 v5, v5, v39
 ; SI-NEXT:    v_or_b32_e32 v7, v7, v38
 ; SI-NEXT:    v_or_b32_e32 v9, v9, v37
@@ -39051,20 +39052,19 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a
 ; SI-NEXT:    v_or_b32_e32 v23, v23, v30
 ; SI-NEXT:    v_or_b32_e32 v25, v25, v29
 ; SI-NEXT:    v_or_b32_e32 v27, v27, v28
-; SI-NEXT:    v_readlane_b32 s53, v58, 13
-; SI-NEXT:    v_readlane_b32 s52, v58, 12
-; SI-NEXT:    v_readlane_b32 s51, v58, 11
-; SI-NEXT:    v_readlane_b32 s50, v58, 10
-; SI-NEXT:    v_readlane_b32 s49, v58, 9
-; SI-NEXT:    v_readlane_b32 s48, v58, 8
-; SI-NEXT:    v_readlane_b32 s39, v58, 7
-; SI-NEXT:    v_readlane_b32 s38, v58, 6
-; SI-NEXT:    v_readlane_b32 s37, v58, 5
-; SI-NEXT:    v_readlane_b32 s36, v58, 4
-; SI-NEXT:    v_readlane_b32 s35, v58, 3
-; SI-NEXT:    v_readlane_b32 s34, v58, 2
-; SI-NEXT:    v_readlane_b32 s31, v58, 1
-; SI-NEXT:    v_readlane_b32 s30, v58, 0
+; SI-NEXT:    v_readlane_b32 s31, v58, 13
+; SI-NEXT:    v_readlane_b32 s53, v58, 11
+; SI-NEXT:    v_readlane_b32 s52, v58, 10
+; SI-NEXT:    v_readlane_b32 s51, v58, 9
+; SI-NEXT:    v_readlane_b32 s50, v58, 8
+; SI-NEXT:    v_readlane_b32 s49, v58, 7
+; SI-NEXT:    v_readlane_b32 s48, v58, 6
+; SI-NEXT:    v_readlane_b32 s39, v58, 5
+; SI-NEXT:    v_readlane_b32 s38, v58, 4
+; SI-NEXT:    v_readlane_b32 s37, v58, 3
+; SI-NEXT:    v_readlane_b32 s36, v58, 2
+; SI-NEXT:    v_readlane_b32 s35, v58, 1
+; SI-NEXT:    v_readlane_b32 s34, v58, 0
 ; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -39077,10 +39077,10 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a
 ; VI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v56, s30, 0
-; VI-NEXT:    v_writelane_b32 v56, s31, 1
+; VI-NEXT:    v_writelane_b32 v56, s34, 0
+; VI-NEXT:    v_writelane_b32 v56, s35, 1
 ; VI-NEXT:    v_readfirstlane_b32 s4, v14
-; VI-NEXT:    v_writelane_b32 v56, s34, 2
+; VI-NEXT:    v_writelane_b32 v56, s30, 2
 ; VI-NEXT:    v_readfirstlane_b32 s9, v13
 ; VI-NEXT:    v_readfirstlane_b32 s8, v12
 ; VI-NEXT:    v_readfirstlane_b32 s11, v11
@@ -39104,7 +39104,7 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a
 ; VI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill
-; VI-NEXT:    v_writelane_b32 v56, s35, 3
+; VI-NEXT:    v_writelane_b32 v56, s31, 3
 ; VI-NEXT:    s_cbranch_scc0 .LBB53_3
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    s_lshr_b32 s44, s9, 16
@@ -39321,6 +39321,7 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a
 ; VI-NEXT:    v_lshlrev_b32_e32 v29, 16, v33
 ; VI-NEXT:    v_or_b32_sdwa v26, v26, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_lshlrev_b32_e32 v28, 16, v31
+; VI-NEXT:    v_readlane_b32 s30, v56, 2
 ; VI-NEXT:    v_or_b32_sdwa v7, v7, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v9, v9, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v11, v11, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -39332,10 +39333,9 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a
 ; VI-NEXT:    v_or_b32_sdwa v23, v23, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v25, v25, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_readlane_b32 s35, v56, 3
-; VI-NEXT:    v_readlane_b32 s34, v56, 2
-; VI-NEXT:    v_readlane_b32 s31, v56, 1
-; VI-NEXT:    v_readlane_b32 s30, v56, 0
+; VI-NEXT:    v_readlane_b32 s31, v56, 3
+; VI-NEXT:    v_readlane_b32 s35, v56, 1
+; VI-NEXT:    v_readlane_b32 s34, v56, 0
 ; VI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -40009,7 +40009,7 @@ end:
   ret <56 x half> %phi
 }
 
-define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) {
+define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v56f16_to_v14f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -41391,7 +41391,7 @@ end:
   ret <14 x double> %phi
 }
 
-define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a, i32 inreg %b) {
+define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v56f16_to_v14f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -41399,37 +41399,37 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a
 ; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v32, s30, 0
-; SI-NEXT:    v_writelane_b32 v32, s31, 1
-; SI-NEXT:    v_writelane_b32 v32, s34, 2
-; SI-NEXT:    v_writelane_b32 v32, s35, 3
-; SI-NEXT:    v_writelane_b32 v32, s36, 4
-; SI-NEXT:    v_writelane_b32 v32, s37, 5
-; SI-NEXT:    v_writelane_b32 v32, s38, 6
-; SI-NEXT:    v_writelane_b32 v32, s39, 7
-; SI-NEXT:    v_writelane_b32 v32, s48, 8
-; SI-NEXT:    v_writelane_b32 v32, s49, 9
-; SI-NEXT:    v_writelane_b32 v32, s50, 10
-; SI-NEXT:    v_writelane_b32 v32, s51, 11
-; SI-NEXT:    v_writelane_b32 v32, s52, 12
-; SI-NEXT:    v_writelane_b32 v32, s53, 13
-; SI-NEXT:    v_writelane_b32 v32, s54, 14
-; SI-NEXT:    v_writelane_b32 v32, s55, 15
-; SI-NEXT:    v_writelane_b32 v32, s64, 16
-; SI-NEXT:    v_writelane_b32 v32, s65, 17
-; SI-NEXT:    v_writelane_b32 v32, s66, 18
-; SI-NEXT:    v_writelane_b32 v32, s67, 19
-; SI-NEXT:    v_writelane_b32 v32, s68, 20
-; SI-NEXT:    v_writelane_b32 v32, s69, 21
-; SI-NEXT:    v_writelane_b32 v32, s70, 22
-; SI-NEXT:    v_writelane_b32 v32, s71, 23
-; SI-NEXT:    v_writelane_b32 v32, s80, 24
-; SI-NEXT:    v_writelane_b32 v32, s81, 25
-; SI-NEXT:    v_writelane_b32 v32, s82, 26
-; SI-NEXT:    v_writelane_b32 v32, s83, 27
-; SI-NEXT:    v_writelane_b32 v32, s84, 28
-; SI-NEXT:    v_writelane_b32 v32, s85, 29
-; SI-NEXT:    v_writelane_b32 v32, s86, 30
+; SI-NEXT:    v_writelane_b32 v32, s34, 0
+; SI-NEXT:    v_writelane_b32 v32, s35, 1
+; SI-NEXT:    v_writelane_b32 v32, s36, 2
+; SI-NEXT:    v_writelane_b32 v32, s37, 3
+; SI-NEXT:    v_writelane_b32 v32, s38, 4
+; SI-NEXT:    v_writelane_b32 v32, s39, 5
+; SI-NEXT:    v_writelane_b32 v32, s48, 6
+; SI-NEXT:    v_writelane_b32 v32, s49, 7
+; SI-NEXT:    v_writelane_b32 v32, s50, 8
+; SI-NEXT:    v_writelane_b32 v32, s51, 9
+; SI-NEXT:    v_writelane_b32 v32, s52, 10
+; SI-NEXT:    v_writelane_b32 v32, s53, 11
+; SI-NEXT:    v_writelane_b32 v32, s54, 12
+; SI-NEXT:    v_writelane_b32 v32, s55, 13
+; SI-NEXT:    v_writelane_b32 v32, s64, 14
+; SI-NEXT:    v_writelane_b32 v32, s65, 15
+; SI-NEXT:    v_writelane_b32 v32, s66, 16
+; SI-NEXT:    v_writelane_b32 v32, s67, 17
+; SI-NEXT:    v_writelane_b32 v32, s68, 18
+; SI-NEXT:    v_writelane_b32 v32, s69, 19
+; SI-NEXT:    v_writelane_b32 v32, s70, 20
+; SI-NEXT:    v_writelane_b32 v32, s71, 21
+; SI-NEXT:    v_writelane_b32 v32, s80, 22
+; SI-NEXT:    v_writelane_b32 v32, s81, 23
+; SI-NEXT:    v_writelane_b32 v32, s82, 24
+; SI-NEXT:    v_writelane_b32 v32, s83, 25
+; SI-NEXT:    v_writelane_b32 v32, s84, 26
+; SI-NEXT:    v_writelane_b32 v32, s85, 27
+; SI-NEXT:    v_writelane_b32 v32, s86, 28
+; SI-NEXT:    v_writelane_b32 v32, s87, 29
+; SI-NEXT:    v_writelane_b32 v32, s30, 30
 ; SI-NEXT:    v_readfirstlane_b32 s6, v13
 ; SI-NEXT:    v_readfirstlane_b32 s8, v12
 ; SI-NEXT:    v_readfirstlane_b32 s10, v11
@@ -41444,7 +41444,7 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a
 ; SI-NEXT:    v_readfirstlane_b32 s92, v2
 ; SI-NEXT:    v_readfirstlane_b32 s95, v1
 ; SI-NEXT:    v_readfirstlane_b32 s34, v0
-; SI-NEXT:    v_writelane_b32 v32, s87, 31
+; SI-NEXT:    v_writelane_b32 v32, s31, 31
 ; SI-NEXT:    s_lshr_b32 s94, s29, 16
 ; SI-NEXT:    s_lshr_b32 s30, s28, 16
 ; SI-NEXT:    s_lshr_b32 s35, s27, 16
@@ -41825,38 +41825,38 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a
 ; SI-NEXT:    v_mov_b32_e32 v30, s66
 ; SI-NEXT:    v_mov_b32_e32 v31, s67
 ; SI-NEXT:  .LBB55_5: ; %end
-; SI-NEXT:    v_readlane_b32 s87, v32, 31
-; SI-NEXT:    v_readlane_b32 s86, v32, 30
-; SI-NEXT:    v_readlane_b32 s85, v32, 29
-; SI-NEXT:    v_readlane_b32 s84, v32, 28
-; SI-NEXT:    v_readlane_b32 s83, v32, 27
-; SI-NEXT:    v_readlane_b32 s82, v32, 26
-; SI-NEXT:    v_readlane_b32 s81, v32, 25
-; SI-NEXT:    v_readlane_b32 s80, v32, 24
-; SI-NEXT:    v_readlane_b32 s71, v32, 23
-; SI-NEXT:    v_readlane_b32 s70, v32, 22
-; SI-NEXT:    v_readlane_b32 s69, v32, 21
-; SI-NEXT:    v_readlane_b32 s68, v32, 20
-; SI-NEXT:    v_readlane_b32 s67, v32, 19
-; SI-NEXT:    v_readlane_b32 s66, v32, 18
-; SI-NEXT:    v_readlane_b32 s65, v32, 17
-; SI-NEXT:    v_readlane_b32 s64, v32, 16
-; SI-NEXT:    v_readlane_b32 s55, v32, 15
-; SI-NEXT:    v_readlane_b32 s54, v32, 14
-; SI-NEXT:    v_readlane_b32 s53, v32, 13
-; SI-NEXT:    v_readlane_b32 s52, v32, 12
-; SI-NEXT:    v_readlane_b32 s51, v32, 11
-; SI-NEXT:    v_readlane_b32 s50, v32, 10
-; SI-NEXT:    v_readlane_b32 s49, v32, 9
-; SI-NEXT:    v_readlane_b32 s48, v32, 8
-; SI-NEXT:    v_readlane_b32 s39, v32, 7
-; SI-NEXT:    v_readlane_b32 s38, v32, 6
-; SI-NEXT:    v_readlane_b32 s37, v32, 5
-; SI-NEXT:    v_readlane_b32 s36, v32, 4
-; SI-NEXT:    v_readlane_b32 s35, v32, 3
-; SI-NEXT:    v_readlane_b32 s34, v32, 2
-; SI-NEXT:    v_readlane_b32 s31, v32, 1
-; SI-NEXT:    v_readlane_b32 s30, v32, 0
+; SI-NEXT:    v_readlane_b32 s30, v32, 30
+; SI-NEXT:    v_readlane_b32 s31, v32, 31
+; SI-NEXT:    v_readlane_b32 s87, v32, 29
+; SI-NEXT:    v_readlane_b32 s86, v32, 28
+; SI-NEXT:    v_readlane_b32 s85, v32, 27
+; SI-NEXT:    v_readlane_b32 s84, v32, 26
+; SI-NEXT:    v_readlane_b32 s83, v32, 25
+; SI-NEXT:    v_readlane_b32 s82, v32, 24
+; SI-NEXT:    v_readlane_b32 s81, v32, 23
+; SI-NEXT:    v_readlane_b32 s80, v32, 22
+; SI-NEXT:    v_readlane_b32 s71, v32, 21
+; SI-NEXT:    v_readlane_b32 s70, v32, 20
+; SI-NEXT:    v_readlane_b32 s69, v32, 19
+; SI-NEXT:    v_readlane_b32 s68, v32, 18
+; SI-NEXT:    v_readlane_b32 s67, v32, 17
+; SI-NEXT:    v_readlane_b32 s66, v32, 16
+; SI-NEXT:    v_readlane_b32 s65, v32, 15
+; SI-NEXT:    v_readlane_b32 s64, v32, 14
+; SI-NEXT:    v_readlane_b32 s55, v32, 13
+; SI-NEXT:    v_readlane_b32 s54, v32, 12
+; SI-NEXT:    v_readlane_b32 s53, v32, 11
+; SI-NEXT:    v_readlane_b32 s52, v32, 10
+; SI-NEXT:    v_readlane_b32 s51, v32, 9
+; SI-NEXT:    v_readlane_b32 s50, v32, 8
+; SI-NEXT:    v_readlane_b32 s49, v32, 7
+; SI-NEXT:    v_readlane_b32 s48, v32, 6
+; SI-NEXT:    v_readlane_b32 s39, v32, 5
+; SI-NEXT:    v_readlane_b32 s38, v32, 4
+; SI-NEXT:    v_readlane_b32 s37, v32, 3
+; SI-NEXT:    v_readlane_b32 s36, v32, 2
+; SI-NEXT:    v_readlane_b32 s35, v32, 1
+; SI-NEXT:    v_readlane_b32 s34, v32, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -41870,41 +41870,41 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v32, s30, 0
-; VI-NEXT:    v_writelane_b32 v32, s31, 1
-; VI-NEXT:    v_writelane_b32 v32, s34, 2
-; VI-NEXT:    v_writelane_b32 v32, s35, 3
-; VI-NEXT:    v_writelane_b32 v32, s36, 4
-; VI-NEXT:    v_writelane_b32 v32, s37, 5
-; VI-NEXT:    v_writelane_b32 v32, s38, 6
-; VI-NEXT:    v_writelane_b32 v32, s39, 7
-; VI-NEXT:    v_writelane_b32 v32, s48, 8
-; VI-NEXT:    v_writelane_b32 v32, s49, 9
-; VI-NEXT:    v_writelane_b32 v32, s50, 10
-; VI-NEXT:    v_writelane_b32 v32, s51, 11
-; VI-NEXT:    v_writelane_b32 v32, s52, 12
-; VI-NEXT:    v_writelane_b32 v32, s53, 13
-; VI-NEXT:    v_writelane_b32 v32, s54, 14
-; VI-NEXT:    v_writelane_b32 v32, s55, 15
-; VI-NEXT:    v_writelane_b32 v32, s64, 16
-; VI-NEXT:    v_writelane_b32 v32, s65, 17
-; VI-NEXT:    v_writelane_b32 v32, s66, 18
-; VI-NEXT:    v_writelane_b32 v32, s67, 19
-; VI-NEXT:    v_writelane_b32 v32, s68, 20
-; VI-NEXT:    v_writelane_b32 v32, s69, 21
-; VI-NEXT:    v_writelane_b32 v32, s70, 22
-; VI-NEXT:    v_writelane_b32 v32, s71, 23
-; VI-NEXT:    v_writelane_b32 v32, s80, 24
-; VI-NEXT:    v_writelane_b32 v32, s81, 25
-; VI-NEXT:    v_writelane_b32 v32, s82, 26
-; VI-NEXT:    v_writelane_b32 v32, s83, 27
-; VI-NEXT:    v_writelane_b32 v32, s84, 28
+; VI-NEXT:    v_writelane_b32 v32, s34, 0
+; VI-NEXT:    v_writelane_b32 v32, s35, 1
+; VI-NEXT:    v_writelane_b32 v32, s36, 2
+; VI-NEXT:    v_writelane_b32 v32, s37, 3
+; VI-NEXT:    v_writelane_b32 v32, s38, 4
+; VI-NEXT:    v_writelane_b32 v32, s39, 5
+; VI-NEXT:    v_writelane_b32 v32, s48, 6
+; VI-NEXT:    v_writelane_b32 v32, s49, 7
+; VI-NEXT:    v_writelane_b32 v32, s50, 8
+; VI-NEXT:    v_writelane_b32 v32, s51, 9
+; VI-NEXT:    v_writelane_b32 v32, s52, 10
+; VI-NEXT:    v_writelane_b32 v32, s53, 11
+; VI-NEXT:    v_writelane_b32 v32, s54, 12
+; VI-NEXT:    v_writelane_b32 v32, s55, 13
+; VI-NEXT:    v_writelane_b32 v32, s64, 14
+; VI-NEXT:    v_writelane_b32 v32, s65, 15
+; VI-NEXT:    v_writelane_b32 v32, s66, 16
+; VI-NEXT:    v_writelane_b32 v32, s67, 17
+; VI-NEXT:    v_writelane_b32 v32, s68, 18
+; VI-NEXT:    v_writelane_b32 v32, s69, 19
+; VI-NEXT:    v_writelane_b32 v32, s70, 20
+; VI-NEXT:    v_writelane_b32 v32, s71, 21
+; VI-NEXT:    v_writelane_b32 v32, s80, 22
+; VI-NEXT:    v_writelane_b32 v32, s81, 23
+; VI-NEXT:    v_writelane_b32 v32, s82, 24
+; VI-NEXT:    v_writelane_b32 v32, s83, 25
+; VI-NEXT:    v_writelane_b32 v32, s84, 26
+; VI-NEXT:    v_writelane_b32 v32, s85, 27
+; VI-NEXT:    v_writelane_b32 v32, s86, 28
 ; VI-NEXT:    v_readfirstlane_b32 s8, v12
-; VI-NEXT:    v_writelane_b32 v32, s85, 29
+; VI-NEXT:    v_writelane_b32 v32, s87, 29
 ; VI-NEXT:    s_lshr_b32 s15, s8, 16
 ; VI-NEXT:    v_readfirstlane_b32 s10, v11
 ; VI-NEXT:    ; implicit-def: $vgpr33 : SGPR spill to VGPR lane
-; VI-NEXT:    v_writelane_b32 v32, s86, 30
+; VI-NEXT:    v_writelane_b32 v32, s30, 30
 ; VI-NEXT:    v_readfirstlane_b32 s6, v13
 ; VI-NEXT:    s_lshr_b32 s61, s10, 16
 ; VI-NEXT:    v_readfirstlane_b32 s12, v10
@@ -41919,7 +41919,7 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a
 ; VI-NEXT:    v_readfirstlane_b32 s80, v1
 ; VI-NEXT:    v_readfirstlane_b32 s83, v0
 ; VI-NEXT:    v_writelane_b32 v33, s15, 0
-; VI-NEXT:    v_writelane_b32 v32, s87, 31
+; VI-NEXT:    v_writelane_b32 v32, s31, 31
 ; VI-NEXT:    s_lshr_b32 s56, s29, 16
 ; VI-NEXT:    s_lshr_b32 s75, s28, 16
 ; VI-NEXT:    s_lshr_b32 s90, s27, 16
@@ -42206,38 +42206,38 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a
 ; VI-NEXT:    v_mov_b32_e32 v30, s66
 ; VI-NEXT:    v_mov_b32_e32 v31, s67
 ; VI-NEXT:  .LBB55_5: ; %end
-; VI-NEXT:    v_readlane_b32 s87, v32, 31
-; VI-NEXT:    v_readlane_b32 s86, v32, 30
-; VI-NEXT:    v_readlane_b32 s85, v32, 29
-; VI-NEXT:    v_readlane_b32 s84, v32, 28
-; VI-NEXT:    v_readlane_b32 s83, v32, 27
-; VI-NEXT:    v_readlane_b32 s82, v32, 26
-; VI-NEXT:    v_readlane_b32 s81, v32, 25
-; VI-NEXT:    v_readlane_b32 s80, v32, 24
-; VI-NEXT:    v_readlane_b32 s71, v32, 23
-; VI-NEXT:    v_readlane_b32 s70, v32, 22
-; VI-NEXT:    v_readlane_b32 s69, v32, 21
-; VI-NEXT:    v_readlane_b32 s68, v32, 20
-; VI-NEXT:    v_readlane_b32 s67, v32, 19
-; VI-NEXT:    v_readlane_b32 s66, v32, 18
-; VI-NEXT:    v_readlane_b32 s65, v32, 17
-; VI-NEXT:    v_readlane_b32 s64, v32, 16
-; VI-NEXT:    v_readlane_b32 s55, v32, 15
-; VI-NEXT:    v_readlane_b32 s54, v32, 14
-; VI-NEXT:    v_readlane_b32 s53, v32, 13
-; VI-NEXT:    v_readlane_b32 s52, v32, 12
-; VI-NEXT:    v_readlane_b32 s51, v32, 11
-; VI-NEXT:    v_readlane_b32 s50, v32, 10
-; VI-NEXT:    v_readlane_b32 s49, v32, 9
-; VI-NEXT:    v_readlane_b32 s48, v32, 8
-; VI-NEXT:    v_readlane_b32 s39, v32, 7
-; VI-NEXT:    v_readlane_b32 s38, v32, 6
-; VI-NEXT:    v_readlane_b32 s37, v32, 5
-; VI-NEXT:    v_readlane_b32 s36, v32, 4
-; VI-NEXT:    v_readlane_b32 s35, v32, 3
-; VI-NEXT:    v_readlane_b32 s34, v32, 2
-; VI-NEXT:    v_readlane_b32 s31, v32, 1
-; VI-NEXT:    v_readlane_b32 s30, v32, 0
+; VI-NEXT:    v_readlane_b32 s30, v32, 30
+; VI-NEXT:    v_readlane_b32 s31, v32, 31
+; VI-NEXT:    v_readlane_b32 s87, v32, 29
+; VI-NEXT:    v_readlane_b32 s86, v32, 28
+; VI-NEXT:    v_readlane_b32 s85, v32, 27
+; VI-NEXT:    v_readlane_b32 s84, v32, 26
+; VI-NEXT:    v_readlane_b32 s83, v32, 25
+; VI-NEXT:    v_readlane_b32 s82, v32, 24
+; VI-NEXT:    v_readlane_b32 s81, v32, 23
+; VI-NEXT:    v_readlane_b32 s80, v32, 22
+; VI-NEXT:    v_readlane_b32 s71, v32, 21
+; VI-NEXT:    v_readlane_b32 s70, v32, 20
+; VI-NEXT:    v_readlane_b32 s69, v32, 19
+; VI-NEXT:    v_readlane_b32 s68, v32, 18
+; VI-NEXT:    v_readlane_b32 s67, v32, 17
+; VI-NEXT:    v_readlane_b32 s66, v32, 16
+; VI-NEXT:    v_readlane_b32 s65, v32, 15
+; VI-NEXT:    v_readlane_b32 s64, v32, 14
+; VI-NEXT:    v_readlane_b32 s55, v32, 13
+; VI-NEXT:    v_readlane_b32 s54, v32, 12
+; VI-NEXT:    v_readlane_b32 s53, v32, 11
+; VI-NEXT:    v_readlane_b32 s52, v32, 10
+; VI-NEXT:    v_readlane_b32 s51, v32, 9
+; VI-NEXT:    v_readlane_b32 s50, v32, 8
+; VI-NEXT:    v_readlane_b32 s49, v32, 7
+; VI-NEXT:    v_readlane_b32 s48, v32, 6
+; VI-NEXT:    v_readlane_b32 s39, v32, 5
+; VI-NEXT:    v_readlane_b32 s38, v32, 4
+; VI-NEXT:    v_readlane_b32 s37, v32, 3
+; VI-NEXT:    v_readlane_b32 s36, v32, 2
+; VI-NEXT:    v_readlane_b32 s35, v32, 1
+; VI-NEXT:    v_readlane_b32 s34, v32, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -42566,7 +42566,7 @@ end:
   ret <14 x double> %phi
 }
 
-define <56 x half> @bitcast_v56i16_to_v56f16(<56 x i16> %a, i32 %b) {
+define <56 x half> @bitcast_v56i16_to_v56f16(<56 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v56i16_to_v56f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -43917,7 +43917,7 @@ end:
   ret <56 x half> %phi
 }
 
-define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i32 inreg %b) {
+define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v56i16_to_v56f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -43926,42 +43926,41 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_writelane_b32 v28, s30, 0
-; SI-NEXT:    v_writelane_b32 v28, s31, 1
-; SI-NEXT:    v_writelane_b32 v28, s34, 2
-; SI-NEXT:    v_writelane_b32 v28, s35, 3
-; SI-NEXT:    v_writelane_b32 v28, s36, 4
-; SI-NEXT:    v_writelane_b32 v28, s37, 5
-; SI-NEXT:    v_writelane_b32 v28, s38, 6
-; SI-NEXT:    v_writelane_b32 v28, s39, 7
-; SI-NEXT:    v_writelane_b32 v28, s48, 8
-; SI-NEXT:    v_writelane_b32 v28, s49, 9
-; SI-NEXT:    v_writelane_b32 v28, s50, 10
-; SI-NEXT:    v_writelane_b32 v28, s51, 11
-; SI-NEXT:    v_writelane_b32 v28, s52, 12
-; SI-NEXT:    v_writelane_b32 v28, s53, 13
-; SI-NEXT:    v_writelane_b32 v28, s54, 14
-; SI-NEXT:    v_writelane_b32 v28, s55, 15
-; SI-NEXT:    v_writelane_b32 v28, s64, 16
-; SI-NEXT:    v_writelane_b32 v28, s65, 17
-; SI-NEXT:    v_writelane_b32 v28, s66, 18
-; SI-NEXT:    v_writelane_b32 v28, s67, 19
-; SI-NEXT:    v_writelane_b32 v28, s68, 20
-; SI-NEXT:    v_writelane_b32 v28, s69, 21
-; SI-NEXT:    v_writelane_b32 v28, s70, 22
-; SI-NEXT:    v_writelane_b32 v28, s71, 23
-; SI-NEXT:    v_writelane_b32 v28, s80, 24
-; SI-NEXT:    v_writelane_b32 v28, s81, 25
-; SI-NEXT:    v_writelane_b32 v28, s82, 26
-; SI-NEXT:    v_writelane_b32 v28, s83, 27
-; SI-NEXT:    v_writelane_b32 v28, s84, 28
-; SI-NEXT:    v_writelane_b32 v28, s85, 29
-; SI-NEXT:    v_writelane_b32 v28, s86, 30
-; SI-NEXT:    v_writelane_b32 v28, s87, 31
-; SI-NEXT:    v_writelane_b32 v28, s96, 32
-; SI-NEXT:    v_writelane_b32 v28, s97, 33
-; SI-NEXT:    v_writelane_b32 v28, s98, 34
-; SI-NEXT:    v_writelane_b32 v28, s99, 35
+; SI-NEXT:    v_writelane_b32 v28, s34, 0
+; SI-NEXT:    v_writelane_b32 v28, s35, 1
+; SI-NEXT:    v_writelane_b32 v28, s36, 2
+; SI-NEXT:    v_writelane_b32 v28, s37, 3
+; SI-NEXT:    v_writelane_b32 v28, s38, 4
+; SI-NEXT:    v_writelane_b32 v28, s39, 5
+; SI-NEXT:    v_writelane_b32 v28, s48, 6
+; SI-NEXT:    v_writelane_b32 v28, s49, 7
+; SI-NEXT:    v_writelane_b32 v28, s50, 8
+; SI-NEXT:    v_writelane_b32 v28, s51, 9
+; SI-NEXT:    v_writelane_b32 v28, s52, 10
+; SI-NEXT:    v_writelane_b32 v28, s53, 11
+; SI-NEXT:    v_writelane_b32 v28, s54, 12
+; SI-NEXT:    v_writelane_b32 v28, s55, 13
+; SI-NEXT:    v_writelane_b32 v28, s64, 14
+; SI-NEXT:    v_writelane_b32 v28, s65, 15
+; SI-NEXT:    v_writelane_b32 v28, s66, 16
+; SI-NEXT:    v_writelane_b32 v28, s67, 17
+; SI-NEXT:    v_writelane_b32 v28, s68, 18
+; SI-NEXT:    v_writelane_b32 v28, s69, 19
+; SI-NEXT:    v_writelane_b32 v28, s70, 20
+; SI-NEXT:    v_writelane_b32 v28, s71, 21
+; SI-NEXT:    v_writelane_b32 v28, s80, 22
+; SI-NEXT:    v_writelane_b32 v28, s81, 23
+; SI-NEXT:    v_writelane_b32 v28, s82, 24
+; SI-NEXT:    v_writelane_b32 v28, s83, 25
+; SI-NEXT:    v_writelane_b32 v28, s84, 26
+; SI-NEXT:    v_writelane_b32 v28, s85, 27
+; SI-NEXT:    v_writelane_b32 v28, s86, 28
+; SI-NEXT:    v_writelane_b32 v28, s87, 29
+; SI-NEXT:    v_writelane_b32 v28, s96, 30
+; SI-NEXT:    v_writelane_b32 v28, s97, 31
+; SI-NEXT:    v_writelane_b32 v28, s98, 32
+; SI-NEXT:    v_writelane_b32 v28, s99, 33
+; SI-NEXT:    v_writelane_b32 v28, s30, 34
 ; SI-NEXT:    v_readfirstlane_b32 s54, v13
 ; SI-NEXT:    v_readfirstlane_b32 s55, v12
 ; SI-NEXT:    v_readfirstlane_b32 s52, v11
@@ -43976,6 +43975,7 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i
 ; SI-NEXT:    v_readfirstlane_b32 s84, v2
 ; SI-NEXT:    v_readfirstlane_b32 s97, v1
 ; SI-NEXT:    v_readfirstlane_b32 s99, v0
+; SI-NEXT:    v_writelane_b32 v28, s31, 35
 ; SI-NEXT:    s_lshr_b32 s64, s29, 16
 ; SI-NEXT:    s_lshr_b32 s37, s28, 16
 ; SI-NEXT:    s_lshr_b32 s65, s27, 16
@@ -44441,6 +44441,7 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i
 ; SI-NEXT:    s_and_b32 s11, s11, 0xffff
 ; SI-NEXT:    s_lshl_b32 s42, s78, 16
 ; SI-NEXT:    s_or_b32 s11, s11, s42
+; SI-NEXT:    v_readlane_b32 s30, v28, 34
 ; SI-NEXT:    v_mov_b32_e32 v0, s8
 ; SI-NEXT:    v_mov_b32_e32 v1, s9
 ; SI-NEXT:    v_mov_b32_e32 v2, s6
@@ -44469,42 +44470,41 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v25, s13
 ; SI-NEXT:    v_mov_b32_e32 v26, s10
 ; SI-NEXT:    v_mov_b32_e32 v27, s11
-; SI-NEXT:    v_readlane_b32 s99, v28, 35
-; SI-NEXT:    v_readlane_b32 s98, v28, 34
-; SI-NEXT:    v_readlane_b32 s97, v28, 33
-; SI-NEXT:    v_readlane_b32 s96, v28, 32
-; SI-NEXT:    v_readlane_b32 s87, v28, 31
-; SI-NEXT:    v_readlane_b32 s86, v28, 30
-; SI-NEXT:    v_readlane_b32 s85, v28, 29
-; SI-NEXT:    v_readlane_b32 s84, v28, 28
-; SI-NEXT:    v_readlane_b32 s83, v28, 27
-; SI-NEXT:    v_readlane_b32 s82, v28, 26
-; SI-NEXT:    v_readlane_b32 s81, v28, 25
-; SI-NEXT:    v_readlane_b32 s80, v28, 24
-; SI-NEXT:    v_readlane_b32 s71, v28, 23
-; SI-NEXT:    v_readlane_b32 s70, v28, 22
-; SI-NEXT:    v_readlane_b32 s69, v28, 21
-; SI-NEXT:    v_readlane_b32 s68, v28, 20
-; SI-NEXT:    v_readlane_b32 s67, v28, 19
-; SI-NEXT:    v_readlane_b32 s66, v28, 18
-; SI-NEXT:    v_readlane_b32 s65, v28, 17
-; SI-NEXT:    v_readlane_b32 s64, v28, 16
-; SI-NEXT:    v_readlane_b32 s55, v28, 15
-; SI-NEXT:    v_readlane_b32 s54, v28, 14
-; SI-NEXT:    v_readlane_b32 s53, v28, 13
-; SI-NEXT:    v_readlane_b32 s52, v28, 12
-; SI-NEXT:    v_readlane_b32 s51, v28, 11
-; SI-NEXT:    v_readlane_b32 s50, v28, 10
-; SI-NEXT:    v_readlane_b32 s49, v28, 9
-; SI-NEXT:    v_readlane_b32 s48, v28, 8
-; SI-NEXT:    v_readlane_b32 s39, v28, 7
-; SI-NEXT:    v_readlane_b32 s38, v28, 6
-; SI-NEXT:    v_readlane_b32 s37, v28, 5
-; SI-NEXT:    v_readlane_b32 s36, v28, 4
-; SI-NEXT:    v_readlane_b32 s35, v28, 3
-; SI-NEXT:    v_readlane_b32 s34, v28, 2
-; SI-NEXT:    v_readlane_b32 s31, v28, 1
-; SI-NEXT:    v_readlane_b32 s30, v28, 0
+; SI-NEXT:    v_readlane_b32 s31, v28, 35
+; SI-NEXT:    v_readlane_b32 s99, v28, 33
+; SI-NEXT:    v_readlane_b32 s98, v28, 32
+; SI-NEXT:    v_readlane_b32 s97, v28, 31
+; SI-NEXT:    v_readlane_b32 s96, v28, 30
+; SI-NEXT:    v_readlane_b32 s87, v28, 29
+; SI-NEXT:    v_readlane_b32 s86, v28, 28
+; SI-NEXT:    v_readlane_b32 s85, v28, 27
+; SI-NEXT:    v_readlane_b32 s84, v28, 26
+; SI-NEXT:    v_readlane_b32 s83, v28, 25
+; SI-NEXT:    v_readlane_b32 s82, v28, 24
+; SI-NEXT:    v_readlane_b32 s81, v28, 23
+; SI-NEXT:    v_readlane_b32 s80, v28, 22
+; SI-NEXT:    v_readlane_b32 s71, v28, 21
+; SI-NEXT:    v_readlane_b32 s70, v28, 20
+; SI-NEXT:    v_readlane_b32 s69, v28, 19
+; SI-NEXT:    v_readlane_b32 s68, v28, 18
+; SI-NEXT:    v_readlane_b32 s67, v28, 17
+; SI-NEXT:    v_readlane_b32 s66, v28, 16
+; SI-NEXT:    v_readlane_b32 s65, v28, 15
+; SI-NEXT:    v_readlane_b32 s64, v28, 14
+; SI-NEXT:    v_readlane_b32 s55, v28, 13
+; SI-NEXT:    v_readlane_b32 s54, v28, 12
+; SI-NEXT:    v_readlane_b32 s53, v28, 11
+; SI-NEXT:    v_readlane_b32 s52, v28, 10
+; SI-NEXT:    v_readlane_b32 s51, v28, 9
+; SI-NEXT:    v_readlane_b32 s50, v28, 8
+; SI-NEXT:    v_readlane_b32 s49, v28, 7
+; SI-NEXT:    v_readlane_b32 s48, v28, 6
+; SI-NEXT:    v_readlane_b32 s39, v28, 5
+; SI-NEXT:    v_readlane_b32 s38, v28, 4
+; SI-NEXT:    v_readlane_b32 s37, v28, 3
+; SI-NEXT:    v_readlane_b32 s36, v28, 2
+; SI-NEXT:    v_readlane_b32 s35, v28, 1
+; SI-NEXT:    v_readlane_b32 s34, v28, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -44559,9 +44559,9 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v28, s30, 0
-; VI-NEXT:    v_writelane_b32 v28, s31, 1
-; VI-NEXT:    v_writelane_b32 v28, s34, 2
+; VI-NEXT:    v_writelane_b32 v28, s34, 0
+; VI-NEXT:    v_writelane_b32 v28, s35, 1
+; VI-NEXT:    v_writelane_b32 v28, s30, 2
 ; VI-NEXT:    v_readfirstlane_b32 s7, v13
 ; VI-NEXT:    v_readfirstlane_b32 s8, v12
 ; VI-NEXT:    v_readfirstlane_b32 s10, v11
@@ -44576,7 +44576,7 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i
 ; VI-NEXT:    v_readfirstlane_b32 s76, v2
 ; VI-NEXT:    v_readfirstlane_b32 s79, v1
 ; VI-NEXT:    v_readfirstlane_b32 s90, v0
-; VI-NEXT:    v_writelane_b32 v28, s35, 3
+; VI-NEXT:    v_writelane_b32 v28, s31, 3
 ; VI-NEXT:    s_lshr_b32 s42, s29, 16
 ; VI-NEXT:    s_lshr_b32 s45, s28, 16
 ; VI-NEXT:    s_lshr_b32 s46, s27, 16
@@ -44752,6 +44752,7 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i
 ; VI-NEXT:    s_or_b32 s10, s10, s11
 ; VI-NEXT:    s_or_b32 s8, s8, s9
 ; VI-NEXT:    s_or_b32 s6, s7, s6
+; VI-NEXT:    v_readlane_b32 s30, v28, 2
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_mov_b32_e32 v2, s16
@@ -44780,10 +44781,9 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i
 ; VI-NEXT:    v_mov_b32_e32 v25, s10
 ; VI-NEXT:    v_mov_b32_e32 v26, s8
 ; VI-NEXT:    v_mov_b32_e32 v27, s6
-; VI-NEXT:    v_readlane_b32 s35, v28, 3
-; VI-NEXT:    v_readlane_b32 s34, v28, 2
-; VI-NEXT:    v_readlane_b32 s31, v28, 1
-; VI-NEXT:    v_readlane_b32 s30, v28, 0
+; VI-NEXT:    v_readlane_b32 s31, v28, 3
+; VI-NEXT:    v_readlane_b32 s35, v28, 1
+; VI-NEXT:    v_readlane_b32 s34, v28, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -45501,7 +45501,7 @@ end:
   ret <56 x half> %phi
 }
 
-define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) {
+define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v56f16_to_v56i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -46459,7 +46459,7 @@ end:
   ret <56 x i16> %phi
 }
 
-define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i32 inreg %b) {
+define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v56f16_to_v56i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -46963,9 +46963,10 @@ define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i
 ; VI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v56, s30, 0
-; VI-NEXT:    v_writelane_b32 v56, s31, 1
-; VI-NEXT:    v_writelane_b32 v56, s34, 2
+; VI-NEXT:    v_writelane_b32 v56, s34, 0
+; VI-NEXT:    v_writelane_b32 v56, s35, 1
+; VI-NEXT:    v_writelane_b32 v56, s30, 2
+; VI-NEXT:    v_writelane_b32 v56, s31, 3
 ; VI-NEXT:    v_readfirstlane_b32 s44, v13
 ; VI-NEXT:    v_readfirstlane_b32 s46, v12
 ; VI-NEXT:    v_readfirstlane_b32 s56, v11
@@ -46980,7 +46981,6 @@ define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i
 ; VI-NEXT:    v_readfirstlane_b32 s90, v2
 ; VI-NEXT:    v_readfirstlane_b32 s30, v1
 ; VI-NEXT:    v_readfirstlane_b32 s34, v0
-; VI-NEXT:    v_writelane_b32 v56, s35, 3
 ; VI-NEXT:    s_lshr_b32 s6, s29, 16
 ; VI-NEXT:    s_lshr_b32 s7, s28, 16
 ; VI-NEXT:    s_lshr_b32 s8, s27, 16
@@ -47185,6 +47185,7 @@ define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i
 ; VI-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
 ; VI-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
 ; VI-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
+; VI-NEXT:    v_readlane_b32 s30, v56, 2
 ; VI-NEXT:    v_or_b32_sdwa v8, v8, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v9, v9, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v10, v10, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -47205,10 +47206,9 @@ define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i
 ; VI-NEXT:    v_or_b32_sdwa v25, v25, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v26, v26, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_readlane_b32 s35, v56, 3
-; VI-NEXT:    v_readlane_b32 s34, v56, 2
-; VI-NEXT:    v_readlane_b32 s31, v56, 1
-; VI-NEXT:    v_readlane_b32 s30, v56, 0
+; VI-NEXT:    v_readlane_b32 s31, v56, 3
+; VI-NEXT:    v_readlane_b32 s35, v56, 1
+; VI-NEXT:    v_readlane_b32 s34, v56, 0
 ; VI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -47924,3 +47924,5 @@ end:
   %phi = phi <56 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
   ret <56 x i16> %phi
 }
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll
index d6cf0367f1b20..8c5d66f1227ea 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll
@@ -6,7 +6,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
-define <30 x float> @bitcast_v30i32_to_v30f32(<30 x i32> %a, i32 %b) {
+define <30 x float> @bitcast_v30i32_to_v30f32(<30 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v30i32_to_v30f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -196,7 +196,7 @@ end:
   ret <30 x float> %phi
 }
 
-define inreg <30 x float> @bitcast_v30i32_to_v30f32_scalar(<30 x i32> inreg %a, i32 inreg %b) {
+define inreg <30 x float> @bitcast_v30i32_to_v30f32_scalar(<30 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v30i32_to_v30f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -557,7 +557,7 @@ end:
   ret <30 x float> %phi
 }
 
-define <30 x i32> @bitcast_v30f32_to_v30i32(<30 x float> %a, i32 %b) {
+define <30 x i32> @bitcast_v30f32_to_v30i32(<30 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v30f32_to_v30i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -732,7 +732,7 @@ end:
   ret <30 x i32> %phi
 }
 
-define inreg <30 x i32> @bitcast_v30f32_to_v30i32_scalar(<30 x float> inreg %a, i32 inreg %b) {
+define inreg <30 x i32> @bitcast_v30f32_to_v30i32_scalar(<30 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v30f32_to_v30i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1308,7 +1308,7 @@ end:
   ret <30 x i32> %phi
 }
 
-define <15 x i64> @bitcast_v30i32_to_v15i64(<30 x i32> %a, i32 %b) {
+define <15 x i64> @bitcast_v30i32_to_v15i64(<30 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v30i32_to_v15i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1498,7 +1498,7 @@ end:
   ret <15 x i64> %phi
 }
 
-define inreg <15 x i64> @bitcast_v30i32_to_v15i64_scalar(<30 x i32> inreg %a, i32 inreg %b) {
+define inreg <15 x i64> @bitcast_v30i32_to_v15i64_scalar(<30 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v30i32_to_v15i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1859,7 +1859,7 @@ end:
   ret <15 x i64> %phi
 }
 
-define <30 x i32> @bitcast_v15i64_to_v30i32(<15 x i64> %a, i32 %b) {
+define <30 x i32> @bitcast_v15i64_to_v30i32(<15 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v15i64_to_v30i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2057,7 +2057,7 @@ end:
   ret <30 x i32> %phi
 }
 
-define inreg <30 x i32> @bitcast_v15i64_to_v30i32_scalar(<15 x i64> inreg %a, i32 inreg %b) {
+define inreg <30 x i32> @bitcast_v15i64_to_v30i32_scalar(<15 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v15i64_to_v30i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2418,7 +2418,7 @@ end:
   ret <30 x i32> %phi
 }
 
-define <15 x double> @bitcast_v30i32_to_v15f64(<30 x i32> %a, i32 %b) {
+define <15 x double> @bitcast_v30i32_to_v15f64(<30 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v30i32_to_v15f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2608,7 +2608,7 @@ end:
   ret <15 x double> %phi
 }
 
-define inreg <15 x double> @bitcast_v30i32_to_v15f64_scalar(<30 x i32> inreg %a, i32 inreg %b) {
+define inreg <15 x double> @bitcast_v30i32_to_v15f64_scalar(<30 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v30i32_to_v15f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2969,7 +2969,7 @@ end:
   ret <15 x double> %phi
 }
 
-define <30 x i32> @bitcast_v15f64_to_v30i32(<15 x double> %a, i32 %b) {
+define <30 x i32> @bitcast_v15f64_to_v30i32(<15 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v15f64_to_v30i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3099,7 +3099,7 @@ end:
   ret <30 x i32> %phi
 }
 
-define inreg <30 x i32> @bitcast_v15f64_to_v30i32_scalar(<15 x double> inreg %a, i32 inreg %b) {
+define inreg <30 x i32> @bitcast_v15f64_to_v30i32_scalar(<15 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v15f64_to_v30i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3615,7 +3615,7 @@ end:
   ret <30 x i32> %phi
 }
 
-define <60 x i16> @bitcast_v30i32_to_v60i16(<30 x i32> %a, i32 %b) {
+define <60 x i16> @bitcast_v30i32_to_v60i16(<30 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v30i32_to_v60i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4523,7 +4523,7 @@ end:
   ret <60 x i16> %phi
 }
 
-define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i32 inreg %b) {
+define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v30i32_to_v60i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4531,23 +4531,23 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
 ; SI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v30, s30, 0
-; SI-NEXT:    v_writelane_b32 v30, s31, 1
-; SI-NEXT:    v_writelane_b32 v30, s34, 2
-; SI-NEXT:    v_writelane_b32 v30, s35, 3
-; SI-NEXT:    v_writelane_b32 v30, s36, 4
-; SI-NEXT:    v_writelane_b32 v30, s37, 5
-; SI-NEXT:    v_writelane_b32 v30, s38, 6
-; SI-NEXT:    v_writelane_b32 v30, s39, 7
-; SI-NEXT:    v_writelane_b32 v30, s48, 8
-; SI-NEXT:    v_writelane_b32 v30, s49, 9
-; SI-NEXT:    v_writelane_b32 v30, s50, 10
-; SI-NEXT:    v_writelane_b32 v30, s51, 11
-; SI-NEXT:    v_writelane_b32 v30, s52, 12
-; SI-NEXT:    v_writelane_b32 v30, s53, 13
-; SI-NEXT:    v_writelane_b32 v30, s54, 14
+; SI-NEXT:    v_writelane_b32 v30, s34, 0
+; SI-NEXT:    v_writelane_b32 v30, s35, 1
+; SI-NEXT:    v_writelane_b32 v30, s36, 2
+; SI-NEXT:    v_writelane_b32 v30, s37, 3
+; SI-NEXT:    v_writelane_b32 v30, s38, 4
+; SI-NEXT:    v_writelane_b32 v30, s39, 5
+; SI-NEXT:    v_writelane_b32 v30, s48, 6
+; SI-NEXT:    v_writelane_b32 v30, s49, 7
+; SI-NEXT:    v_writelane_b32 v30, s50, 8
+; SI-NEXT:    v_writelane_b32 v30, s51, 9
+; SI-NEXT:    v_writelane_b32 v30, s52, 10
+; SI-NEXT:    v_writelane_b32 v30, s53, 11
+; SI-NEXT:    v_writelane_b32 v30, s54, 12
+; SI-NEXT:    v_writelane_b32 v30, s55, 13
+; SI-NEXT:    v_writelane_b32 v30, s64, 14
 ; SI-NEXT:    v_readfirstlane_b32 s42, v16
-; SI-NEXT:    v_writelane_b32 v30, s55, 15
+; SI-NEXT:    v_writelane_b32 v30, s30, 15
 ; SI-NEXT:    v_readfirstlane_b32 s5, v15
 ; SI-NEXT:    v_readfirstlane_b32 s4, v14
 ; SI-NEXT:    v_readfirstlane_b32 s7, v13
@@ -4565,7 +4565,7 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
 ; SI-NEXT:    v_readfirstlane_b32 s43, v1
 ; SI-NEXT:    s_cmp_lg_u32 s42, 0
 ; SI-NEXT:    v_readfirstlane_b32 s42, v0
-; SI-NEXT:    v_writelane_b32 v30, s64, 16
+; SI-NEXT:    v_writelane_b32 v30, s31, 16
 ; SI-NEXT:    s_cbranch_scc0 .LBB13_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s34, s5, 16
@@ -4751,6 +4751,7 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
 ; SI-NEXT:    s_lshl_b32 s44, s34, 16
 ; SI-NEXT:    s_or_b32 s7, s7, s45
 ; SI-NEXT:    s_or_b32 s5, s5, s44
+; SI-NEXT:    v_readlane_b32 s30, v30, 15
 ; SI-NEXT:    v_mov_b32_e32 v0, s16
 ; SI-NEXT:    v_mov_b32_e32 v1, s17
 ; SI-NEXT:    v_mov_b32_e32 v2, s18
@@ -4781,23 +4782,22 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
 ; SI-NEXT:    v_mov_b32_e32 v27, s7
 ; SI-NEXT:    v_mov_b32_e32 v28, s4
 ; SI-NEXT:    v_mov_b32_e32 v29, s5
-; SI-NEXT:    v_readlane_b32 s64, v30, 16
-; SI-NEXT:    v_readlane_b32 s55, v30, 15
-; SI-NEXT:    v_readlane_b32 s54, v30, 14
-; SI-NEXT:    v_readlane_b32 s53, v30, 13
-; SI-NEXT:    v_readlane_b32 s52, v30, 12
-; SI-NEXT:    v_readlane_b32 s51, v30, 11
-; SI-NEXT:    v_readlane_b32 s50, v30, 10
-; SI-NEXT:    v_readlane_b32 s49, v30, 9
-; SI-NEXT:    v_readlane_b32 s48, v30, 8
-; SI-NEXT:    v_readlane_b32 s39, v30, 7
-; SI-NEXT:    v_readlane_b32 s38, v30, 6
-; SI-NEXT:    v_readlane_b32 s37, v30, 5
-; SI-NEXT:    v_readlane_b32 s36, v30, 4
-; SI-NEXT:    v_readlane_b32 s35, v30, 3
-; SI-NEXT:    v_readlane_b32 s34, v30, 2
-; SI-NEXT:    v_readlane_b32 s31, v30, 1
-; SI-NEXT:    v_readlane_b32 s30, v30, 0
+; SI-NEXT:    v_readlane_b32 s31, v30, 16
+; SI-NEXT:    v_readlane_b32 s64, v30, 14
+; SI-NEXT:    v_readlane_b32 s55, v30, 13
+; SI-NEXT:    v_readlane_b32 s54, v30, 12
+; SI-NEXT:    v_readlane_b32 s53, v30, 11
+; SI-NEXT:    v_readlane_b32 s52, v30, 10
+; SI-NEXT:    v_readlane_b32 s51, v30, 9
+; SI-NEXT:    v_readlane_b32 s50, v30, 8
+; SI-NEXT:    v_readlane_b32 s49, v30, 7
+; SI-NEXT:    v_readlane_b32 s48, v30, 6
+; SI-NEXT:    v_readlane_b32 s39, v30, 5
+; SI-NEXT:    v_readlane_b32 s38, v30, 4
+; SI-NEXT:    v_readlane_b32 s37, v30, 3
+; SI-NEXT:    v_readlane_b32 s36, v30, 2
+; SI-NEXT:    v_readlane_b32 s35, v30, 1
+; SI-NEXT:    v_readlane_b32 s34, v30, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -4842,14 +4842,14 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v30, s30, 0
-; VI-NEXT:    v_writelane_b32 v30, s31, 1
-; VI-NEXT:    v_writelane_b32 v30, s34, 2
-; VI-NEXT:    v_writelane_b32 v30, s35, 3
-; VI-NEXT:    v_writelane_b32 v30, s36, 4
-; VI-NEXT:    v_writelane_b32 v30, s37, 5
+; VI-NEXT:    v_writelane_b32 v30, s34, 0
+; VI-NEXT:    v_writelane_b32 v30, s35, 1
+; VI-NEXT:    v_writelane_b32 v30, s36, 2
+; VI-NEXT:    v_writelane_b32 v30, s37, 3
+; VI-NEXT:    v_writelane_b32 v30, s38, 4
+; VI-NEXT:    v_writelane_b32 v30, s39, 5
 ; VI-NEXT:    v_readfirstlane_b32 s4, v16
-; VI-NEXT:    v_writelane_b32 v30, s38, 6
+; VI-NEXT:    v_writelane_b32 v30, s30, 6
 ; VI-NEXT:    v_readfirstlane_b32 s6, v15
 ; VI-NEXT:    v_readfirstlane_b32 s7, v14
 ; VI-NEXT:    v_readfirstlane_b32 s8, v13
@@ -4867,7 +4867,7 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
 ; VI-NEXT:    v_readfirstlane_b32 s44, v1
 ; VI-NEXT:    s_cmp_lg_u32 s4, 0
 ; VI-NEXT:    v_readfirstlane_b32 s45, v0
-; VI-NEXT:    v_writelane_b32 v30, s39, 7
+; VI-NEXT:    v_writelane_b32 v30, s31, 7
 ; VI-NEXT:    s_cbranch_scc0 .LBB13_4
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    s_lshr_b32 s46, s6, 16
@@ -5053,6 +5053,7 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
 ; VI-NEXT:    s_and_b32 s6, 0xffff, s6
 ; VI-NEXT:    s_lshl_b32 s44, s46, 16
 ; VI-NEXT:    s_or_b32 s6, s6, s44
+; VI-NEXT:    v_readlane_b32 s30, v30, 6
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_mov_b32_e32 v2, s16
@@ -5083,14 +5084,13 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
 ; VI-NEXT:    v_mov_b32_e32 v27, s8
 ; VI-NEXT:    v_mov_b32_e32 v28, s7
 ; VI-NEXT:    v_mov_b32_e32 v29, s6
-; VI-NEXT:    v_readlane_b32 s39, v30, 7
-; VI-NEXT:    v_readlane_b32 s38, v30, 6
-; VI-NEXT:    v_readlane_b32 s37, v30, 5
-; VI-NEXT:    v_readlane_b32 s36, v30, 4
-; VI-NEXT:    v_readlane_b32 s35, v30, 3
-; VI-NEXT:    v_readlane_b32 s34, v30, 2
-; VI-NEXT:    v_readlane_b32 s31, v30, 1
-; VI-NEXT:    v_readlane_b32 s30, v30, 0
+; VI-NEXT:    v_readlane_b32 s31, v30, 7
+; VI-NEXT:    v_readlane_b32 s39, v30, 5
+; VI-NEXT:    v_readlane_b32 s38, v30, 4
+; VI-NEXT:    v_readlane_b32 s37, v30, 3
+; VI-NEXT:    v_readlane_b32 s36, v30, 2
+; VI-NEXT:    v_readlane_b32 s35, v30, 1
+; VI-NEXT:    v_readlane_b32 s34, v30, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -5135,10 +5135,10 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v30, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v30, s31, 1
+; GFX9-NEXT:    v_writelane_b32 v30, s34, 0
+; GFX9-NEXT:    v_writelane_b32 v30, s35, 1
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v16
-; GFX9-NEXT:    v_writelane_b32 v30, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v30, s30, 2
 ; GFX9-NEXT:    v_readfirstlane_b32 s6, v15
 ; GFX9-NEXT:    v_readfirstlane_b32 s7, v14
 ; GFX9-NEXT:    v_readfirstlane_b32 s8, v13
@@ -5156,7 +5156,7 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
 ; GFX9-NEXT:    v_readfirstlane_b32 s44, v1
 ; GFX9-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX9-NEXT:    v_readfirstlane_b32 s45, v0
-; GFX9-NEXT:    v_writelane_b32 v30, s35, 3
+; GFX9-NEXT:    v_writelane_b32 v30, s31, 3
 ; GFX9-NEXT:    s_cbranch_scc0 .LBB13_4
 ; GFX9-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX9-NEXT:    s_lshr_b32 s46, s6, 16
@@ -5282,6 +5282,7 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s8, s8, s56
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s7, s7, s47
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s6, s6, s46
+; GFX9-NEXT:    v_readlane_b32 s30, v30, 2
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s16
@@ -5312,10 +5313,9 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
 ; GFX9-NEXT:    v_mov_b32_e32 v27, s8
 ; GFX9-NEXT:    v_mov_b32_e32 v28, s7
 ; GFX9-NEXT:    v_mov_b32_e32 v29, s6
-; GFX9-NEXT:    v_readlane_b32 s35, v30, 3
-; GFX9-NEXT:    v_readlane_b32 s34, v30, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v30, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v30, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v30, 3
+; GFX9-NEXT:    v_readlane_b32 s35, v30, 1
+; GFX9-NEXT:    v_readlane_b32 s34, v30, 0
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -5564,7 +5564,7 @@ end:
   ret <60 x i16> %phi
 }
 
-define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) {
+define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v60i16_to_v30i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6909,7 +6909,7 @@ end:
   ret <30 x i32> %phi
 }
 
-define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i32 inreg %b) {
+define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v60i16_to_v30i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6917,41 +6917,42 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3
 ; SI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v30, s30, 0
-; SI-NEXT:    v_writelane_b32 v30, s31, 1
-; SI-NEXT:    v_writelane_b32 v30, s34, 2
-; SI-NEXT:    v_writelane_b32 v30, s35, 3
-; SI-NEXT:    v_writelane_b32 v30, s36, 4
-; SI-NEXT:    v_writelane_b32 v30, s37, 5
-; SI-NEXT:    v_writelane_b32 v30, s38, 6
-; SI-NEXT:    v_writelane_b32 v30, s39, 7
-; SI-NEXT:    v_writelane_b32 v30, s48, 8
-; SI-NEXT:    v_writelane_b32 v30, s49, 9
-; SI-NEXT:    v_writelane_b32 v30, s50, 10
-; SI-NEXT:    v_writelane_b32 v30, s51, 11
-; SI-NEXT:    v_writelane_b32 v30, s52, 12
-; SI-NEXT:    v_writelane_b32 v30, s53, 13
-; SI-NEXT:    v_writelane_b32 v30, s54, 14
-; SI-NEXT:    v_writelane_b32 v30, s55, 15
-; SI-NEXT:    v_writelane_b32 v30, s64, 16
-; SI-NEXT:    v_writelane_b32 v30, s65, 17
-; SI-NEXT:    v_writelane_b32 v30, s66, 18
-; SI-NEXT:    v_writelane_b32 v30, s67, 19
-; SI-NEXT:    v_writelane_b32 v30, s68, 20
-; SI-NEXT:    v_writelane_b32 v30, s69, 21
-; SI-NEXT:    v_writelane_b32 v30, s70, 22
-; SI-NEXT:    v_writelane_b32 v30, s71, 23
-; SI-NEXT:    v_writelane_b32 v30, s80, 24
-; SI-NEXT:    v_writelane_b32 v30, s81, 25
-; SI-NEXT:    v_writelane_b32 v30, s82, 26
-; SI-NEXT:    v_writelane_b32 v30, s83, 27
-; SI-NEXT:    v_writelane_b32 v30, s84, 28
-; SI-NEXT:    v_writelane_b32 v30, s85, 29
-; SI-NEXT:    v_writelane_b32 v30, s86, 30
-; SI-NEXT:    v_writelane_b32 v30, s87, 31
-; SI-NEXT:    v_writelane_b32 v30, s96, 32
-; SI-NEXT:    v_writelane_b32 v30, s97, 33
-; SI-NEXT:    v_writelane_b32 v30, s98, 34
+; SI-NEXT:    v_writelane_b32 v30, s34, 0
+; SI-NEXT:    v_writelane_b32 v30, s35, 1
+; SI-NEXT:    v_writelane_b32 v30, s36, 2
+; SI-NEXT:    v_writelane_b32 v30, s37, 3
+; SI-NEXT:    v_writelane_b32 v30, s38, 4
+; SI-NEXT:    v_writelane_b32 v30, s39, 5
+; SI-NEXT:    v_writelane_b32 v30, s48, 6
+; SI-NEXT:    v_writelane_b32 v30, s49, 7
+; SI-NEXT:    v_writelane_b32 v30, s50, 8
+; SI-NEXT:    v_writelane_b32 v30, s51, 9
+; SI-NEXT:    v_writelane_b32 v30, s52, 10
+; SI-NEXT:    v_writelane_b32 v30, s53, 11
+; SI-NEXT:    v_writelane_b32 v30, s54, 12
+; SI-NEXT:    v_writelane_b32 v30, s55, 13
+; SI-NEXT:    v_writelane_b32 v30, s64, 14
+; SI-NEXT:    v_writelane_b32 v30, s65, 15
+; SI-NEXT:    v_writelane_b32 v30, s66, 16
+; SI-NEXT:    v_writelane_b32 v30, s67, 17
+; SI-NEXT:    v_writelane_b32 v30, s68, 18
+; SI-NEXT:    v_writelane_b32 v30, s69, 19
+; SI-NEXT:    v_writelane_b32 v30, s70, 20
+; SI-NEXT:    v_writelane_b32 v30, s71, 21
+; SI-NEXT:    v_writelane_b32 v30, s80, 22
+; SI-NEXT:    v_writelane_b32 v30, s81, 23
+; SI-NEXT:    v_writelane_b32 v30, s82, 24
+; SI-NEXT:    v_writelane_b32 v30, s83, 25
+; SI-NEXT:    v_writelane_b32 v30, s84, 26
+; SI-NEXT:    v_writelane_b32 v30, s85, 27
+; SI-NEXT:    v_writelane_b32 v30, s86, 28
+; SI-NEXT:    v_writelane_b32 v30, s87, 29
+; SI-NEXT:    v_writelane_b32 v30, s96, 30
+; SI-NEXT:    v_writelane_b32 v30, s97, 31
+; SI-NEXT:    v_writelane_b32 v30, s98, 32
+; SI-NEXT:    v_writelane_b32 v30, s99, 33
+; SI-NEXT:    v_writelane_b32 v30, s30, 34
+; SI-NEXT:    v_writelane_b32 v30, s31, 35
 ; SI-NEXT:    v_readfirstlane_b32 s7, v15
 ; SI-NEXT:    v_readfirstlane_b32 s9, v14
 ; SI-NEXT:    v_readfirstlane_b32 s11, v13
@@ -6968,7 +6969,6 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3
 ; SI-NEXT:    v_readfirstlane_b32 s70, v2
 ; SI-NEXT:    v_readfirstlane_b32 s81, v1
 ; SI-NEXT:    v_readfirstlane_b32 s84, v0
-; SI-NEXT:    v_writelane_b32 v30, s99, 35
 ; SI-NEXT:    s_lshr_b32 s90, s29, 16
 ; SI-NEXT:    s_lshr_b32 s92, s28, 16
 ; SI-NEXT:    s_lshr_b32 s94, s27, 16
@@ -7246,6 +7246,7 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_add_i32 s65, s4, 0x30000
 ; SI-NEXT:  .LBB15_3: ; %end
+; SI-NEXT:    v_readlane_b32 s30, v30, 34
 ; SI-NEXT:    v_mov_b32_e32 v0, s36
 ; SI-NEXT:    v_mov_b32_e32 v1, s37
 ; SI-NEXT:    v_mov_b32_e32 v2, s38
@@ -7276,42 +7277,41 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3
 ; SI-NEXT:    v_mov_b32_e32 v27, s63
 ; SI-NEXT:    v_mov_b32_e32 v28, s64
 ; SI-NEXT:    v_mov_b32_e32 v29, s65
-; SI-NEXT:    v_readlane_b32 s99, v30, 35
-; SI-NEXT:    v_readlane_b32 s98, v30, 34
-; SI-NEXT:    v_readlane_b32 s97, v30, 33
-; SI-NEXT:    v_readlane_b32 s96, v30, 32
-; SI-NEXT:    v_readlane_b32 s87, v30, 31
-; SI-NEXT:    v_readlane_b32 s86, v30, 30
-; SI-NEXT:    v_readlane_b32 s85, v30, 29
-; SI-NEXT:    v_readlane_b32 s84, v30, 28
-; SI-NEXT:    v_readlane_b32 s83, v30, 27
-; SI-NEXT:    v_readlane_b32 s82, v30, 26
-; SI-NEXT:    v_readlane_b32 s81, v30, 25
-; SI-NEXT:    v_readlane_b32 s80, v30, 24
-; SI-NEXT:    v_readlane_b32 s71, v30, 23
-; SI-NEXT:    v_readlane_b32 s70, v30, 22
-; SI-NEXT:    v_readlane_b32 s69, v30, 21
-; SI-NEXT:    v_readlane_b32 s68, v30, 20
-; SI-NEXT:    v_readlane_b32 s67, v30, 19
-; SI-NEXT:    v_readlane_b32 s66, v30, 18
-; SI-NEXT:    v_readlane_b32 s65, v30, 17
-; SI-NEXT:    v_readlane_b32 s64, v30, 16
-; SI-NEXT:    v_readlane_b32 s55, v30, 15
-; SI-NEXT:    v_readlane_b32 s54, v30, 14
-; SI-NEXT:    v_readlane_b32 s53, v30, 13
-; SI-NEXT:    v_readlane_b32 s52, v30, 12
-; SI-NEXT:    v_readlane_b32 s51, v30, 11
-; SI-NEXT:    v_readlane_b32 s50, v30, 10
-; SI-NEXT:    v_readlane_b32 s49, v30, 9
-; SI-NEXT:    v_readlane_b32 s48, v30, 8
-; SI-NEXT:    v_readlane_b32 s39, v30, 7
-; SI-NEXT:    v_readlane_b32 s38, v30, 6
-; SI-NEXT:    v_readlane_b32 s37, v30, 5
-; SI-NEXT:    v_readlane_b32 s36, v30, 4
-; SI-NEXT:    v_readlane_b32 s35, v30, 3
-; SI-NEXT:    v_readlane_b32 s34, v30, 2
-; SI-NEXT:    v_readlane_b32 s31, v30, 1
-; SI-NEXT:    v_readlane_b32 s30, v30, 0
+; SI-NEXT:    v_readlane_b32 s31, v30, 35
+; SI-NEXT:    v_readlane_b32 s99, v30, 33
+; SI-NEXT:    v_readlane_b32 s98, v30, 32
+; SI-NEXT:    v_readlane_b32 s97, v30, 31
+; SI-NEXT:    v_readlane_b32 s96, v30, 30
+; SI-NEXT:    v_readlane_b32 s87, v30, 29
+; SI-NEXT:    v_readlane_b32 s86, v30, 28
+; SI-NEXT:    v_readlane_b32 s85, v30, 27
+; SI-NEXT:    v_readlane_b32 s84, v30, 26
+; SI-NEXT:    v_readlane_b32 s83, v30, 25
+; SI-NEXT:    v_readlane_b32 s82, v30, 24
+; SI-NEXT:    v_readlane_b32 s81, v30, 23
+; SI-NEXT:    v_readlane_b32 s80, v30, 22
+; SI-NEXT:    v_readlane_b32 s71, v30, 21
+; SI-NEXT:    v_readlane_b32 s70, v30, 20
+; SI-NEXT:    v_readlane_b32 s69, v30, 19
+; SI-NEXT:    v_readlane_b32 s68, v30, 18
+; SI-NEXT:    v_readlane_b32 s67, v30, 17
+; SI-NEXT:    v_readlane_b32 s66, v30, 16
+; SI-NEXT:    v_readlane_b32 s65, v30, 15
+; SI-NEXT:    v_readlane_b32 s64, v30, 14
+; SI-NEXT:    v_readlane_b32 s55, v30, 13
+; SI-NEXT:    v_readlane_b32 s54, v30, 12
+; SI-NEXT:    v_readlane_b32 s53, v30, 11
+; SI-NEXT:    v_readlane_b32 s52, v30, 10
+; SI-NEXT:    v_readlane_b32 s51, v30, 9
+; SI-NEXT:    v_readlane_b32 s50, v30, 8
+; SI-NEXT:    v_readlane_b32 s49, v30, 7
+; SI-NEXT:    v_readlane_b32 s48, v30, 6
+; SI-NEXT:    v_readlane_b32 s39, v30, 5
+; SI-NEXT:    v_readlane_b32 s38, v30, 4
+; SI-NEXT:    v_readlane_b32 s37, v30, 3
+; SI-NEXT:    v_readlane_b32 s36, v30, 2
+; SI-NEXT:    v_readlane_b32 s35, v30, 1
+; SI-NEXT:    v_readlane_b32 s34, v30, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -7328,47 +7328,48 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3
 ; VI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v30, s30, 0
-; VI-NEXT:    v_writelane_b32 v30, s31, 1
-; VI-NEXT:    v_writelane_b32 v30, s34, 2
-; VI-NEXT:    v_writelane_b32 v30, s35, 3
-; VI-NEXT:    v_writelane_b32 v30, s36, 4
-; VI-NEXT:    v_writelane_b32 v30, s37, 5
-; VI-NEXT:    v_writelane_b32 v30, s38, 6
-; VI-NEXT:    v_writelane_b32 v30, s39, 7
-; VI-NEXT:    v_writelane_b32 v30, s48, 8
-; VI-NEXT:    v_writelane_b32 v30, s49, 9
-; VI-NEXT:    v_writelane_b32 v30, s50, 10
-; VI-NEXT:    v_writelane_b32 v30, s51, 11
-; VI-NEXT:    v_writelane_b32 v30, s52, 12
-; VI-NEXT:    v_writelane_b32 v30, s53, 13
-; VI-NEXT:    v_writelane_b32 v30, s54, 14
-; VI-NEXT:    v_writelane_b32 v30, s55, 15
-; VI-NEXT:    v_writelane_b32 v30, s64, 16
-; VI-NEXT:    v_writelane_b32 v30, s65, 17
-; VI-NEXT:    v_writelane_b32 v30, s66, 18
-; VI-NEXT:    v_writelane_b32 v30, s67, 19
-; VI-NEXT:    v_writelane_b32 v30, s68, 20
-; VI-NEXT:    v_writelane_b32 v30, s69, 21
-; VI-NEXT:    v_writelane_b32 v30, s70, 22
-; VI-NEXT:    v_writelane_b32 v30, s71, 23
-; VI-NEXT:    v_writelane_b32 v30, s80, 24
-; VI-NEXT:    v_writelane_b32 v30, s81, 25
-; VI-NEXT:    v_writelane_b32 v30, s82, 26
+; VI-NEXT:    v_writelane_b32 v30, s34, 0
+; VI-NEXT:    v_writelane_b32 v30, s35, 1
+; VI-NEXT:    v_writelane_b32 v30, s36, 2
+; VI-NEXT:    v_writelane_b32 v30, s37, 3
+; VI-NEXT:    v_writelane_b32 v30, s38, 4
+; VI-NEXT:    v_writelane_b32 v30, s39, 5
+; VI-NEXT:    v_writelane_b32 v30, s48, 6
+; VI-NEXT:    v_writelane_b32 v30, s49, 7
+; VI-NEXT:    v_writelane_b32 v30, s50, 8
+; VI-NEXT:    v_writelane_b32 v30, s51, 9
+; VI-NEXT:    v_writelane_b32 v30, s52, 10
+; VI-NEXT:    v_writelane_b32 v30, s53, 11
+; VI-NEXT:    v_writelane_b32 v30, s54, 12
+; VI-NEXT:    v_writelane_b32 v30, s55, 13
+; VI-NEXT:    v_writelane_b32 v30, s64, 14
+; VI-NEXT:    v_writelane_b32 v30, s65, 15
+; VI-NEXT:    v_writelane_b32 v30, s66, 16
+; VI-NEXT:    v_writelane_b32 v30, s67, 17
+; VI-NEXT:    v_writelane_b32 v30, s68, 18
+; VI-NEXT:    v_writelane_b32 v30, s69, 19
+; VI-NEXT:    v_writelane_b32 v30, s70, 20
+; VI-NEXT:    v_writelane_b32 v30, s71, 21
+; VI-NEXT:    v_writelane_b32 v30, s80, 22
+; VI-NEXT:    v_writelane_b32 v30, s81, 23
+; VI-NEXT:    v_writelane_b32 v30, s82, 24
+; VI-NEXT:    v_writelane_b32 v30, s83, 25
+; VI-NEXT:    v_writelane_b32 v30, s84, 26
+; VI-NEXT:    v_writelane_b32 v30, s85, 27
 ; VI-NEXT:    v_readfirstlane_b32 s11, v13
-; VI-NEXT:    v_writelane_b32 v30, s83, 27
+; VI-NEXT:    v_writelane_b32 v30, s86, 28
 ; VI-NEXT:    s_lshr_b32 s63, s11, 16
 ; VI-NEXT:    v_readfirstlane_b32 s13, v12
 ; VI-NEXT:    ; implicit-def: $vgpr31 : SGPR spill to VGPR lane
-; VI-NEXT:    v_writelane_b32 v30, s84, 28
+; VI-NEXT:    v_writelane_b32 v30, s87, 29
 ; VI-NEXT:    s_lshr_b32 s62, s13, 16
 ; VI-NEXT:    v_readfirstlane_b32 s15, v11
 ; VI-NEXT:    v_writelane_b32 v31, s63, 0
-; VI-NEXT:    v_writelane_b32 v30, s85, 29
+; VI-NEXT:    v_writelane_b32 v30, s30, 30
 ; VI-NEXT:    s_lshr_b32 s61, s15, 16
 ; VI-NEXT:    v_readfirstlane_b32 s73, v10
 ; VI-NEXT:    v_writelane_b32 v31, s62, 1
-; VI-NEXT:    v_writelane_b32 v30, s86, 30
+; VI-NEXT:    v_writelane_b32 v30, s31, 31
 ; VI-NEXT:    v_readfirstlane_b32 s7, v15
 ; VI-NEXT:    v_readfirstlane_b32 s9, v14
 ; VI-NEXT:    s_lshr_b32 s60, s73, 16
@@ -7383,7 +7384,6 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3
 ; VI-NEXT:    v_readfirstlane_b32 s85, v1
 ; VI-NEXT:    v_readfirstlane_b32 s6, v0
 ; VI-NEXT:    v_writelane_b32 v31, s61, 2
-; VI-NEXT:    v_writelane_b32 v30, s87, 31
 ; VI-NEXT:    s_lshr_b32 s90, s29, 16
 ; VI-NEXT:    s_lshr_b32 s30, s28, 16
 ; VI-NEXT:    s_lshr_b32 s34, s27, 16
@@ -7666,6 +7666,7 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3
 ; VI-NEXT:    s_or_b32 s4, s5, s4
 ; VI-NEXT:    s_add_i32 s65, s4, 0x30000
 ; VI-NEXT:  .LBB15_3: ; %end
+; VI-NEXT:    v_readlane_b32 s30, v30, 30
 ; VI-NEXT:    v_mov_b32_e32 v0, s36
 ; VI-NEXT:    v_mov_b32_e32 v1, s37
 ; VI-NEXT:    v_mov_b32_e32 v2, s38
@@ -7696,38 +7697,37 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3
 ; VI-NEXT:    v_mov_b32_e32 v27, s63
 ; VI-NEXT:    v_mov_b32_e32 v28, s64
 ; VI-NEXT:    v_mov_b32_e32 v29, s65
-; VI-NEXT:    v_readlane_b32 s87, v30, 31
-; VI-NEXT:    v_readlane_b32 s86, v30, 30
-; VI-NEXT:    v_readlane_b32 s85, v30, 29
-; VI-NEXT:    v_readlane_b32 s84, v30, 28
-; VI-NEXT:    v_readlane_b32 s83, v30, 27
-; VI-NEXT:    v_readlane_b32 s82, v30, 26
-; VI-NEXT:    v_readlane_b32 s81, v30, 25
-; VI-NEXT:    v_readlane_b32 s80, v30, 24
-; VI-NEXT:    v_readlane_b32 s71, v30, 23
-; VI-NEXT:    v_readlane_b32 s70, v30, 22
-; VI-NEXT:    v_readlane_b32 s69, v30, 21
-; VI-NEXT:    v_readlane_b32 s68, v30, 20
-; VI-NEXT:    v_readlane_b32 s67, v30, 19
-; VI-NEXT:    v_readlane_b32 s66, v30, 18
-; VI-NEXT:    v_readlane_b32 s65, v30, 17
-; VI-NEXT:    v_readlane_b32 s64, v30, 16
-; VI-NEXT:    v_readlane_b32 s55, v30, 15
-; VI-NEXT:    v_readlane_b32 s54, v30, 14
-; VI-NEXT:    v_readlane_b32 s53, v30, 13
-; VI-NEXT:    v_readlane_b32 s52, v30, 12
-; VI-NEXT:    v_readlane_b32 s51, v30, 11
-; VI-NEXT:    v_readlane_b32 s50, v30, 10
-; VI-NEXT:    v_readlane_b32 s49, v30, 9
-; VI-NEXT:    v_readlane_b32 s48, v30, 8
-; VI-NEXT:    v_readlane_b32 s39, v30, 7
-; VI-NEXT:    v_readlane_b32 s38, v30, 6
-; VI-NEXT:    v_readlane_b32 s37, v30, 5
-; VI-NEXT:    v_readlane_b32 s36, v30, 4
-; VI-NEXT:    v_readlane_b32 s35, v30, 3
-; VI-NEXT:    v_readlane_b32 s34, v30, 2
-; VI-NEXT:    v_readlane_b32 s31, v30, 1
-; VI-NEXT:    v_readlane_b32 s30, v30, 0
+; VI-NEXT:    v_readlane_b32 s31, v30, 31
+; VI-NEXT:    v_readlane_b32 s87, v30, 29
+; VI-NEXT:    v_readlane_b32 s86, v30, 28
+; VI-NEXT:    v_readlane_b32 s85, v30, 27
+; VI-NEXT:    v_readlane_b32 s84, v30, 26
+; VI-NEXT:    v_readlane_b32 s83, v30, 25
+; VI-NEXT:    v_readlane_b32 s82, v30, 24
+; VI-NEXT:    v_readlane_b32 s81, v30, 23
+; VI-NEXT:    v_readlane_b32 s80, v30, 22
+; VI-NEXT:    v_readlane_b32 s71, v30, 21
+; VI-NEXT:    v_readlane_b32 s70, v30, 20
+; VI-NEXT:    v_readlane_b32 s69, v30, 19
+; VI-NEXT:    v_readlane_b32 s68, v30, 18
+; VI-NEXT:    v_readlane_b32 s67, v30, 17
+; VI-NEXT:    v_readlane_b32 s66, v30, 16
+; VI-NEXT:    v_readlane_b32 s65, v30, 15
+; VI-NEXT:    v_readlane_b32 s64, v30, 14
+; VI-NEXT:    v_readlane_b32 s55, v30, 13
+; VI-NEXT:    v_readlane_b32 s54, v30, 12
+; VI-NEXT:    v_readlane_b32 s53, v30, 11
+; VI-NEXT:    v_readlane_b32 s52, v30, 10
+; VI-NEXT:    v_readlane_b32 s51, v30, 9
+; VI-NEXT:    v_readlane_b32 s50, v30, 8
+; VI-NEXT:    v_readlane_b32 s49, v30, 7
+; VI-NEXT:    v_readlane_b32 s48, v30, 6
+; VI-NEXT:    v_readlane_b32 s39, v30, 5
+; VI-NEXT:    v_readlane_b32 s38, v30, 4
+; VI-NEXT:    v_readlane_b32 s37, v30, 3
+; VI-NEXT:    v_readlane_b32 s36, v30, 2
+; VI-NEXT:    v_readlane_b32 s35, v30, 1
+; VI-NEXT:    v_readlane_b32 s34, v30, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -8082,7 +8082,7 @@ end:
   ret <30 x i32> %phi
 }
 
-define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) {
+define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v30i32_to_v60f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8990,7 +8990,7 @@ end:
   ret <60 x half> %phi
 }
 
-define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i32 inreg %b) {
+define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v30i32_to_v60f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8998,23 +8998,23 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v30, s30, 0
-; SI-NEXT:    v_writelane_b32 v30, s31, 1
-; SI-NEXT:    v_writelane_b32 v30, s34, 2
-; SI-NEXT:    v_writelane_b32 v30, s35, 3
-; SI-NEXT:    v_writelane_b32 v30, s36, 4
-; SI-NEXT:    v_writelane_b32 v30, s37, 5
-; SI-NEXT:    v_writelane_b32 v30, s38, 6
-; SI-NEXT:    v_writelane_b32 v30, s39, 7
-; SI-NEXT:    v_writelane_b32 v30, s48, 8
-; SI-NEXT:    v_writelane_b32 v30, s49, 9
-; SI-NEXT:    v_writelane_b32 v30, s50, 10
-; SI-NEXT:    v_writelane_b32 v30, s51, 11
-; SI-NEXT:    v_writelane_b32 v30, s52, 12
-; SI-NEXT:    v_writelane_b32 v30, s53, 13
-; SI-NEXT:    v_writelane_b32 v30, s54, 14
+; SI-NEXT:    v_writelane_b32 v30, s34, 0
+; SI-NEXT:    v_writelane_b32 v30, s35, 1
+; SI-NEXT:    v_writelane_b32 v30, s36, 2
+; SI-NEXT:    v_writelane_b32 v30, s37, 3
+; SI-NEXT:    v_writelane_b32 v30, s38, 4
+; SI-NEXT:    v_writelane_b32 v30, s39, 5
+; SI-NEXT:    v_writelane_b32 v30, s48, 6
+; SI-NEXT:    v_writelane_b32 v30, s49, 7
+; SI-NEXT:    v_writelane_b32 v30, s50, 8
+; SI-NEXT:    v_writelane_b32 v30, s51, 9
+; SI-NEXT:    v_writelane_b32 v30, s52, 10
+; SI-NEXT:    v_writelane_b32 v30, s53, 11
+; SI-NEXT:    v_writelane_b32 v30, s54, 12
+; SI-NEXT:    v_writelane_b32 v30, s55, 13
+; SI-NEXT:    v_writelane_b32 v30, s64, 14
 ; SI-NEXT:    v_readfirstlane_b32 s42, v16
-; SI-NEXT:    v_writelane_b32 v30, s55, 15
+; SI-NEXT:    v_writelane_b32 v30, s30, 15
 ; SI-NEXT:    v_readfirstlane_b32 s5, v15
 ; SI-NEXT:    v_readfirstlane_b32 s4, v14
 ; SI-NEXT:    v_readfirstlane_b32 s7, v13
@@ -9032,7 +9032,7 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
 ; SI-NEXT:    v_readfirstlane_b32 s43, v1
 ; SI-NEXT:    s_cmp_lg_u32 s42, 0
 ; SI-NEXT:    v_readfirstlane_b32 s42, v0
-; SI-NEXT:    v_writelane_b32 v30, s64, 16
+; SI-NEXT:    v_writelane_b32 v30, s31, 16
 ; SI-NEXT:    s_cbranch_scc0 .LBB17_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s34, s5, 16
@@ -9218,6 +9218,7 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
 ; SI-NEXT:    s_lshl_b32 s44, s34, 16
 ; SI-NEXT:    s_or_b32 s7, s7, s45
 ; SI-NEXT:    s_or_b32 s5, s5, s44
+; SI-NEXT:    v_readlane_b32 s30, v30, 15
 ; SI-NEXT:    v_mov_b32_e32 v0, s16
 ; SI-NEXT:    v_mov_b32_e32 v1, s17
 ; SI-NEXT:    v_mov_b32_e32 v2, s18
@@ -9248,23 +9249,22 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v27, s7
 ; SI-NEXT:    v_mov_b32_e32 v28, s4
 ; SI-NEXT:    v_mov_b32_e32 v29, s5
-; SI-NEXT:    v_readlane_b32 s64, v30, 16
-; SI-NEXT:    v_readlane_b32 s55, v30, 15
-; SI-NEXT:    v_readlane_b32 s54, v30, 14
-; SI-NEXT:    v_readlane_b32 s53, v30, 13
-; SI-NEXT:    v_readlane_b32 s52, v30, 12
-; SI-NEXT:    v_readlane_b32 s51, v30, 11
-; SI-NEXT:    v_readlane_b32 s50, v30, 10
-; SI-NEXT:    v_readlane_b32 s49, v30, 9
-; SI-NEXT:    v_readlane_b32 s48, v30, 8
-; SI-NEXT:    v_readlane_b32 s39, v30, 7
-; SI-NEXT:    v_readlane_b32 s38, v30, 6
-; SI-NEXT:    v_readlane_b32 s37, v30, 5
-; SI-NEXT:    v_readlane_b32 s36, v30, 4
-; SI-NEXT:    v_readlane_b32 s35, v30, 3
-; SI-NEXT:    v_readlane_b32 s34, v30, 2
-; SI-NEXT:    v_readlane_b32 s31, v30, 1
-; SI-NEXT:    v_readlane_b32 s30, v30, 0
+; SI-NEXT:    v_readlane_b32 s31, v30, 16
+; SI-NEXT:    v_readlane_b32 s64, v30, 14
+; SI-NEXT:    v_readlane_b32 s55, v30, 13
+; SI-NEXT:    v_readlane_b32 s54, v30, 12
+; SI-NEXT:    v_readlane_b32 s53, v30, 11
+; SI-NEXT:    v_readlane_b32 s52, v30, 10
+; SI-NEXT:    v_readlane_b32 s51, v30, 9
+; SI-NEXT:    v_readlane_b32 s50, v30, 8
+; SI-NEXT:    v_readlane_b32 s49, v30, 7
+; SI-NEXT:    v_readlane_b32 s48, v30, 6
+; SI-NEXT:    v_readlane_b32 s39, v30, 5
+; SI-NEXT:    v_readlane_b32 s38, v30, 4
+; SI-NEXT:    v_readlane_b32 s37, v30, 3
+; SI-NEXT:    v_readlane_b32 s36, v30, 2
+; SI-NEXT:    v_readlane_b32 s35, v30, 1
+; SI-NEXT:    v_readlane_b32 s34, v30, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -9309,14 +9309,14 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v30, s30, 0
-; VI-NEXT:    v_writelane_b32 v30, s31, 1
-; VI-NEXT:    v_writelane_b32 v30, s34, 2
-; VI-NEXT:    v_writelane_b32 v30, s35, 3
-; VI-NEXT:    v_writelane_b32 v30, s36, 4
-; VI-NEXT:    v_writelane_b32 v30, s37, 5
+; VI-NEXT:    v_writelane_b32 v30, s34, 0
+; VI-NEXT:    v_writelane_b32 v30, s35, 1
+; VI-NEXT:    v_writelane_b32 v30, s36, 2
+; VI-NEXT:    v_writelane_b32 v30, s37, 3
+; VI-NEXT:    v_writelane_b32 v30, s38, 4
+; VI-NEXT:    v_writelane_b32 v30, s39, 5
 ; VI-NEXT:    v_readfirstlane_b32 s4, v16
-; VI-NEXT:    v_writelane_b32 v30, s38, 6
+; VI-NEXT:    v_writelane_b32 v30, s30, 6
 ; VI-NEXT:    v_readfirstlane_b32 s6, v15
 ; VI-NEXT:    v_readfirstlane_b32 s7, v14
 ; VI-NEXT:    v_readfirstlane_b32 s8, v13
@@ -9334,7 +9334,7 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
 ; VI-NEXT:    v_readfirstlane_b32 s44, v1
 ; VI-NEXT:    s_cmp_lg_u32 s4, 0
 ; VI-NEXT:    v_readfirstlane_b32 s45, v0
-; VI-NEXT:    v_writelane_b32 v30, s39, 7
+; VI-NEXT:    v_writelane_b32 v30, s31, 7
 ; VI-NEXT:    s_cbranch_scc0 .LBB17_4
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    s_lshr_b32 s46, s6, 16
@@ -9520,6 +9520,7 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
 ; VI-NEXT:    s_and_b32 s6, 0xffff, s6
 ; VI-NEXT:    s_lshl_b32 s44, s46, 16
 ; VI-NEXT:    s_or_b32 s6, s6, s44
+; VI-NEXT:    v_readlane_b32 s30, v30, 6
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_mov_b32_e32 v2, s16
@@ -9550,14 +9551,13 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
 ; VI-NEXT:    v_mov_b32_e32 v27, s8
 ; VI-NEXT:    v_mov_b32_e32 v28, s7
 ; VI-NEXT:    v_mov_b32_e32 v29, s6
-; VI-NEXT:    v_readlane_b32 s39, v30, 7
-; VI-NEXT:    v_readlane_b32 s38, v30, 6
-; VI-NEXT:    v_readlane_b32 s37, v30, 5
-; VI-NEXT:    v_readlane_b32 s36, v30, 4
-; VI-NEXT:    v_readlane_b32 s35, v30, 3
-; VI-NEXT:    v_readlane_b32 s34, v30, 2
-; VI-NEXT:    v_readlane_b32 s31, v30, 1
-; VI-NEXT:    v_readlane_b32 s30, v30, 0
+; VI-NEXT:    v_readlane_b32 s31, v30, 7
+; VI-NEXT:    v_readlane_b32 s39, v30, 5
+; VI-NEXT:    v_readlane_b32 s38, v30, 4
+; VI-NEXT:    v_readlane_b32 s37, v30, 3
+; VI-NEXT:    v_readlane_b32 s36, v30, 2
+; VI-NEXT:    v_readlane_b32 s35, v30, 1
+; VI-NEXT:    v_readlane_b32 s34, v30, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -9602,10 +9602,10 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v30, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v30, s31, 1
+; GFX9-NEXT:    v_writelane_b32 v30, s34, 0
+; GFX9-NEXT:    v_writelane_b32 v30, s35, 1
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v16
-; GFX9-NEXT:    v_writelane_b32 v30, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v30, s30, 2
 ; GFX9-NEXT:    v_readfirstlane_b32 s6, v15
 ; GFX9-NEXT:    v_readfirstlane_b32 s7, v14
 ; GFX9-NEXT:    v_readfirstlane_b32 s8, v13
@@ -9623,7 +9623,7 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
 ; GFX9-NEXT:    v_readfirstlane_b32 s44, v1
 ; GFX9-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX9-NEXT:    v_readfirstlane_b32 s45, v0
-; GFX9-NEXT:    v_writelane_b32 v30, s35, 3
+; GFX9-NEXT:    v_writelane_b32 v30, s31, 3
 ; GFX9-NEXT:    s_cbranch_scc0 .LBB17_4
 ; GFX9-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX9-NEXT:    s_lshr_b32 s46, s6, 16
@@ -9749,6 +9749,7 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s8, s8, s56
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s7, s7, s47
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s6, s6, s46
+; GFX9-NEXT:    v_readlane_b32 s30, v30, 2
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s16
@@ -9779,10 +9780,9 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
 ; GFX9-NEXT:    v_mov_b32_e32 v27, s8
 ; GFX9-NEXT:    v_mov_b32_e32 v28, s7
 ; GFX9-NEXT:    v_mov_b32_e32 v29, s6
-; GFX9-NEXT:    v_readlane_b32 s35, v30, 3
-; GFX9-NEXT:    v_readlane_b32 s34, v30, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v30, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v30, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v30, 3
+; GFX9-NEXT:    v_readlane_b32 s35, v30, 1
+; GFX9-NEXT:    v_readlane_b32 s34, v30, 0
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -10031,7 +10031,7 @@ end:
   ret <60 x half> %phi
 }
 
-define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) {
+define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v60f16_to_v30i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11520,7 +11520,7 @@ end:
   ret <30 x i32> %phi
 }
 
-define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i32 inreg %b) {
+define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v60f16_to_v30i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11528,41 +11528,42 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v32, s30, 0
-; SI-NEXT:    v_writelane_b32 v32, s31, 1
-; SI-NEXT:    v_writelane_b32 v32, s34, 2
-; SI-NEXT:    v_writelane_b32 v32, s35, 3
-; SI-NEXT:    v_writelane_b32 v32, s36, 4
-; SI-NEXT:    v_writelane_b32 v32, s37, 5
-; SI-NEXT:    v_writelane_b32 v32, s38, 6
-; SI-NEXT:    v_writelane_b32 v32, s39, 7
-; SI-NEXT:    v_writelane_b32 v32, s48, 8
-; SI-NEXT:    v_writelane_b32 v32, s49, 9
-; SI-NEXT:    v_writelane_b32 v32, s50, 10
-; SI-NEXT:    v_writelane_b32 v32, s51, 11
-; SI-NEXT:    v_writelane_b32 v32, s52, 12
-; SI-NEXT:    v_writelane_b32 v32, s53, 13
-; SI-NEXT:    v_writelane_b32 v32, s54, 14
-; SI-NEXT:    v_writelane_b32 v32, s55, 15
-; SI-NEXT:    v_writelane_b32 v32, s64, 16
-; SI-NEXT:    v_writelane_b32 v32, s65, 17
-; SI-NEXT:    v_writelane_b32 v32, s66, 18
-; SI-NEXT:    v_writelane_b32 v32, s67, 19
-; SI-NEXT:    v_writelane_b32 v32, s68, 20
-; SI-NEXT:    v_writelane_b32 v32, s69, 21
-; SI-NEXT:    v_writelane_b32 v32, s70, 22
-; SI-NEXT:    v_writelane_b32 v32, s71, 23
-; SI-NEXT:    v_writelane_b32 v32, s80, 24
-; SI-NEXT:    v_writelane_b32 v32, s81, 25
-; SI-NEXT:    v_writelane_b32 v32, s82, 26
-; SI-NEXT:    v_writelane_b32 v32, s83, 27
-; SI-NEXT:    v_writelane_b32 v32, s84, 28
-; SI-NEXT:    v_writelane_b32 v32, s85, 29
-; SI-NEXT:    v_writelane_b32 v32, s86, 30
-; SI-NEXT:    v_writelane_b32 v32, s87, 31
-; SI-NEXT:    v_writelane_b32 v32, s96, 32
-; SI-NEXT:    v_writelane_b32 v32, s97, 33
-; SI-NEXT:    v_writelane_b32 v32, s98, 34
+; SI-NEXT:    v_writelane_b32 v32, s34, 0
+; SI-NEXT:    v_writelane_b32 v32, s35, 1
+; SI-NEXT:    v_writelane_b32 v32, s36, 2
+; SI-NEXT:    v_writelane_b32 v32, s37, 3
+; SI-NEXT:    v_writelane_b32 v32, s38, 4
+; SI-NEXT:    v_writelane_b32 v32, s39, 5
+; SI-NEXT:    v_writelane_b32 v32, s48, 6
+; SI-NEXT:    v_writelane_b32 v32, s49, 7
+; SI-NEXT:    v_writelane_b32 v32, s50, 8
+; SI-NEXT:    v_writelane_b32 v32, s51, 9
+; SI-NEXT:    v_writelane_b32 v32, s52, 10
+; SI-NEXT:    v_writelane_b32 v32, s53, 11
+; SI-NEXT:    v_writelane_b32 v32, s54, 12
+; SI-NEXT:    v_writelane_b32 v32, s55, 13
+; SI-NEXT:    v_writelane_b32 v32, s64, 14
+; SI-NEXT:    v_writelane_b32 v32, s65, 15
+; SI-NEXT:    v_writelane_b32 v32, s66, 16
+; SI-NEXT:    v_writelane_b32 v32, s67, 17
+; SI-NEXT:    v_writelane_b32 v32, s68, 18
+; SI-NEXT:    v_writelane_b32 v32, s69, 19
+; SI-NEXT:    v_writelane_b32 v32, s70, 20
+; SI-NEXT:    v_writelane_b32 v32, s71, 21
+; SI-NEXT:    v_writelane_b32 v32, s80, 22
+; SI-NEXT:    v_writelane_b32 v32, s81, 23
+; SI-NEXT:    v_writelane_b32 v32, s82, 24
+; SI-NEXT:    v_writelane_b32 v32, s83, 25
+; SI-NEXT:    v_writelane_b32 v32, s84, 26
+; SI-NEXT:    v_writelane_b32 v32, s85, 27
+; SI-NEXT:    v_writelane_b32 v32, s86, 28
+; SI-NEXT:    v_writelane_b32 v32, s87, 29
+; SI-NEXT:    v_writelane_b32 v32, s96, 30
+; SI-NEXT:    v_writelane_b32 v32, s97, 31
+; SI-NEXT:    v_writelane_b32 v32, s98, 32
+; SI-NEXT:    v_writelane_b32 v32, s99, 33
+; SI-NEXT:    v_writelane_b32 v32, s30, 34
+; SI-NEXT:    v_writelane_b32 v32, s31, 35
 ; SI-NEXT:    v_readfirstlane_b32 s6, v15
 ; SI-NEXT:    v_readfirstlane_b32 s8, v14
 ; SI-NEXT:    v_readfirstlane_b32 s10, v13
@@ -11579,7 +11580,6 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i
 ; SI-NEXT:    v_readfirstlane_b32 s30, v2
 ; SI-NEXT:    v_readfirstlane_b32 s35, v1
 ; SI-NEXT:    v_readfirstlane_b32 s70, v0
-; SI-NEXT:    v_writelane_b32 v32, s99, 35
 ; SI-NEXT:    s_lshr_b32 s31, s29, 16
 ; SI-NEXT:    s_lshr_b32 s68, s28, 16
 ; SI-NEXT:    s_lshr_b32 s71, s27, 16
@@ -11984,42 +11984,42 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v30, s66
 ; SI-NEXT:    v_mov_b32_e32 v31, s67
 ; SI-NEXT:  .LBB19_5: ; %end
-; SI-NEXT:    v_readlane_b32 s99, v32, 35
-; SI-NEXT:    v_readlane_b32 s98, v32, 34
-; SI-NEXT:    v_readlane_b32 s97, v32, 33
-; SI-NEXT:    v_readlane_b32 s96, v32, 32
-; SI-NEXT:    v_readlane_b32 s87, v32, 31
-; SI-NEXT:    v_readlane_b32 s86, v32, 30
-; SI-NEXT:    v_readlane_b32 s85, v32, 29
-; SI-NEXT:    v_readlane_b32 s84, v32, 28
-; SI-NEXT:    v_readlane_b32 s83, v32, 27
-; SI-NEXT:    v_readlane_b32 s82, v32, 26
-; SI-NEXT:    v_readlane_b32 s81, v32, 25
-; SI-NEXT:    v_readlane_b32 s80, v32, 24
-; SI-NEXT:    v_readlane_b32 s71, v32, 23
-; SI-NEXT:    v_readlane_b32 s70, v32, 22
-; SI-NEXT:    v_readlane_b32 s69, v32, 21
-; SI-NEXT:    v_readlane_b32 s68, v32, 20
-; SI-NEXT:    v_readlane_b32 s67, v32, 19
-; SI-NEXT:    v_readlane_b32 s66, v32, 18
-; SI-NEXT:    v_readlane_b32 s65, v32, 17
-; SI-NEXT:    v_readlane_b32 s64, v32, 16
-; SI-NEXT:    v_readlane_b32 s55, v32, 15
-; SI-NEXT:    v_readlane_b32 s54, v32, 14
-; SI-NEXT:    v_readlane_b32 s53, v32, 13
-; SI-NEXT:    v_readlane_b32 s52, v32, 12
-; SI-NEXT:    v_readlane_b32 s51, v32, 11
-; SI-NEXT:    v_readlane_b32 s50, v32, 10
-; SI-NEXT:    v_readlane_b32 s49, v32, 9
-; SI-NEXT:    v_readlane_b32 s48, v32, 8
-; SI-NEXT:    v_readlane_b32 s39, v32, 7
-; SI-NEXT:    v_readlane_b32 s38, v32, 6
-; SI-NEXT:    v_readlane_b32 s37, v32, 5
-; SI-NEXT:    v_readlane_b32 s36, v32, 4
-; SI-NEXT:    v_readlane_b32 s35, v32, 3
-; SI-NEXT:    v_readlane_b32 s34, v32, 2
-; SI-NEXT:    v_readlane_b32 s31, v32, 1
-; SI-NEXT:    v_readlane_b32 s30, v32, 0
+; SI-NEXT:    v_readlane_b32 s30, v32, 34
+; SI-NEXT:    v_readlane_b32 s31, v32, 35
+; SI-NEXT:    v_readlane_b32 s99, v32, 33
+; SI-NEXT:    v_readlane_b32 s98, v32, 32
+; SI-NEXT:    v_readlane_b32 s97, v32, 31
+; SI-NEXT:    v_readlane_b32 s96, v32, 30
+; SI-NEXT:    v_readlane_b32 s87, v32, 29
+; SI-NEXT:    v_readlane_b32 s86, v32, 28
+; SI-NEXT:    v_readlane_b32 s85, v32, 27
+; SI-NEXT:    v_readlane_b32 s84, v32, 26
+; SI-NEXT:    v_readlane_b32 s83, v32, 25
+; SI-NEXT:    v_readlane_b32 s82, v32, 24
+; SI-NEXT:    v_readlane_b32 s81, v32, 23
+; SI-NEXT:    v_readlane_b32 s80, v32, 22
+; SI-NEXT:    v_readlane_b32 s71, v32, 21
+; SI-NEXT:    v_readlane_b32 s70, v32, 20
+; SI-NEXT:    v_readlane_b32 s69, v32, 19
+; SI-NEXT:    v_readlane_b32 s68, v32, 18
+; SI-NEXT:    v_readlane_b32 s67, v32, 17
+; SI-NEXT:    v_readlane_b32 s66, v32, 16
+; SI-NEXT:    v_readlane_b32 s65, v32, 15
+; SI-NEXT:    v_readlane_b32 s64, v32, 14
+; SI-NEXT:    v_readlane_b32 s55, v32, 13
+; SI-NEXT:    v_readlane_b32 s54, v32, 12
+; SI-NEXT:    v_readlane_b32 s53, v32, 11
+; SI-NEXT:    v_readlane_b32 s52, v32, 10
+; SI-NEXT:    v_readlane_b32 s51, v32, 9
+; SI-NEXT:    v_readlane_b32 s50, v32, 8
+; SI-NEXT:    v_readlane_b32 s49, v32, 7
+; SI-NEXT:    v_readlane_b32 s48, v32, 6
+; SI-NEXT:    v_readlane_b32 s39, v32, 5
+; SI-NEXT:    v_readlane_b32 s38, v32, 4
+; SI-NEXT:    v_readlane_b32 s37, v32, 3
+; SI-NEXT:    v_readlane_b32 s36, v32, 2
+; SI-NEXT:    v_readlane_b32 s35, v32, 1
+; SI-NEXT:    v_readlane_b32 s34, v32, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -12033,53 +12033,54 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v32, s30, 0
-; VI-NEXT:    v_writelane_b32 v32, s31, 1
-; VI-NEXT:    v_writelane_b32 v32, s34, 2
-; VI-NEXT:    v_writelane_b32 v32, s35, 3
-; VI-NEXT:    v_writelane_b32 v32, s36, 4
-; VI-NEXT:    v_writelane_b32 v32, s37, 5
-; VI-NEXT:    v_writelane_b32 v32, s38, 6
-; VI-NEXT:    v_writelane_b32 v32, s39, 7
-; VI-NEXT:    v_writelane_b32 v32, s48, 8
-; VI-NEXT:    v_writelane_b32 v32, s49, 9
-; VI-NEXT:    v_writelane_b32 v32, s50, 10
-; VI-NEXT:    v_writelane_b32 v32, s51, 11
-; VI-NEXT:    v_writelane_b32 v32, s52, 12
-; VI-NEXT:    v_writelane_b32 v32, s53, 13
-; VI-NEXT:    v_writelane_b32 v32, s54, 14
-; VI-NEXT:    v_writelane_b32 v32, s55, 15
-; VI-NEXT:    v_writelane_b32 v32, s64, 16
-; VI-NEXT:    v_writelane_b32 v32, s65, 17
-; VI-NEXT:    v_writelane_b32 v32, s66, 18
-; VI-NEXT:    v_writelane_b32 v32, s67, 19
-; VI-NEXT:    v_writelane_b32 v32, s68, 20
-; VI-NEXT:    v_writelane_b32 v32, s69, 21
-; VI-NEXT:    v_writelane_b32 v32, s70, 22
-; VI-NEXT:    v_writelane_b32 v32, s71, 23
-; VI-NEXT:    v_writelane_b32 v32, s80, 24
+; VI-NEXT:    v_writelane_b32 v32, s34, 0
+; VI-NEXT:    v_writelane_b32 v32, s35, 1
+; VI-NEXT:    v_writelane_b32 v32, s36, 2
+; VI-NEXT:    v_writelane_b32 v32, s37, 3
+; VI-NEXT:    v_writelane_b32 v32, s38, 4
+; VI-NEXT:    v_writelane_b32 v32, s39, 5
+; VI-NEXT:    v_writelane_b32 v32, s48, 6
+; VI-NEXT:    v_writelane_b32 v32, s49, 7
+; VI-NEXT:    v_writelane_b32 v32, s50, 8
+; VI-NEXT:    v_writelane_b32 v32, s51, 9
+; VI-NEXT:    v_writelane_b32 v32, s52, 10
+; VI-NEXT:    v_writelane_b32 v32, s53, 11
+; VI-NEXT:    v_writelane_b32 v32, s54, 12
+; VI-NEXT:    v_writelane_b32 v32, s55, 13
+; VI-NEXT:    v_writelane_b32 v32, s64, 14
+; VI-NEXT:    v_writelane_b32 v32, s65, 15
+; VI-NEXT:    v_writelane_b32 v32, s66, 16
+; VI-NEXT:    v_writelane_b32 v32, s67, 17
+; VI-NEXT:    v_writelane_b32 v32, s68, 18
+; VI-NEXT:    v_writelane_b32 v32, s69, 19
+; VI-NEXT:    v_writelane_b32 v32, s70, 20
+; VI-NEXT:    v_writelane_b32 v32, s71, 21
+; VI-NEXT:    v_writelane_b32 v32, s80, 22
+; VI-NEXT:    v_writelane_b32 v32, s81, 23
+; VI-NEXT:    v_writelane_b32 v32, s82, 24
+; VI-NEXT:    v_writelane_b32 v32, s83, 25
 ; VI-NEXT:    v_readfirstlane_b32 s6, v15
-; VI-NEXT:    v_writelane_b32 v32, s81, 25
+; VI-NEXT:    v_writelane_b32 v32, s84, 26
 ; VI-NEXT:    s_lshr_b32 vcc_lo, s6, 16
 ; VI-NEXT:    v_readfirstlane_b32 s8, v14
 ; VI-NEXT:    ; implicit-def: $vgpr33 : SGPR spill to VGPR lane
-; VI-NEXT:    v_writelane_b32 v32, s82, 26
+; VI-NEXT:    v_writelane_b32 v32, s85, 27
 ; VI-NEXT:    s_lshr_b32 vcc_hi, s8, 16
 ; VI-NEXT:    v_readfirstlane_b32 s10, v13
 ; VI-NEXT:    v_writelane_b32 v33, vcc_lo, 0
-; VI-NEXT:    v_writelane_b32 v32, s83, 27
+; VI-NEXT:    v_writelane_b32 v32, s86, 28
 ; VI-NEXT:    s_lshr_b32 s63, s10, 16
 ; VI-NEXT:    v_readfirstlane_b32 s12, v12
 ; VI-NEXT:    v_writelane_b32 v33, vcc_hi, 1
-; VI-NEXT:    v_writelane_b32 v32, s84, 28
+; VI-NEXT:    v_writelane_b32 v32, s87, 29
 ; VI-NEXT:    s_lshr_b32 s62, s12, 16
 ; VI-NEXT:    v_readfirstlane_b32 s14, v11
 ; VI-NEXT:    v_writelane_b32 v33, s63, 2
-; VI-NEXT:    v_writelane_b32 v32, s85, 29
+; VI-NEXT:    v_writelane_b32 v32, s30, 30
 ; VI-NEXT:    s_lshr_b32 s61, s14, 16
 ; VI-NEXT:    v_readfirstlane_b32 s72, v10
 ; VI-NEXT:    v_writelane_b32 v33, s62, 3
-; VI-NEXT:    v_writelane_b32 v32, s86, 30
+; VI-NEXT:    v_writelane_b32 v32, s31, 31
 ; VI-NEXT:    s_lshr_b32 s60, s72, 16
 ; VI-NEXT:    v_readfirstlane_b32 s74, v9
 ; VI-NEXT:    v_readfirstlane_b32 s76, v8
@@ -12092,7 +12093,6 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i
 ; VI-NEXT:    v_readfirstlane_b32 s85, v1
 ; VI-NEXT:    v_readfirstlane_b32 s7, v0
 ; VI-NEXT:    v_writelane_b32 v33, s61, 4
-; VI-NEXT:    v_writelane_b32 v32, s87, 31
 ; VI-NEXT:    s_lshr_b32 s56, s29, 16
 ; VI-NEXT:    s_lshr_b32 s88, s28, 16
 ; VI-NEXT:    s_lshr_b32 s31, s27, 16
@@ -12391,38 +12391,38 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i
 ; VI-NEXT:    v_mov_b32_e32 v30, s66
 ; VI-NEXT:    v_mov_b32_e32 v31, s67
 ; VI-NEXT:  .LBB19_5: ; %end
-; VI-NEXT:    v_readlane_b32 s87, v32, 31
-; VI-NEXT:    v_readlane_b32 s86, v32, 30
-; VI-NEXT:    v_readlane_b32 s85, v32, 29
-; VI-NEXT:    v_readlane_b32 s84, v32, 28
-; VI-NEXT:    v_readlane_b32 s83, v32, 27
-; VI-NEXT:    v_readlane_b32 s82, v32, 26
-; VI-NEXT:    v_readlane_b32 s81, v32, 25
-; VI-NEXT:    v_readlane_b32 s80, v32, 24
-; VI-NEXT:    v_readlane_b32 s71, v32, 23
-; VI-NEXT:    v_readlane_b32 s70, v32, 22
-; VI-NEXT:    v_readlane_b32 s69, v32, 21
-; VI-NEXT:    v_readlane_b32 s68, v32, 20
-; VI-NEXT:    v_readlane_b32 s67, v32, 19
-; VI-NEXT:    v_readlane_b32 s66, v32, 18
-; VI-NEXT:    v_readlane_b32 s65, v32, 17
-; VI-NEXT:    v_readlane_b32 s64, v32, 16
-; VI-NEXT:    v_readlane_b32 s55, v32, 15
-; VI-NEXT:    v_readlane_b32 s54, v32, 14
-; VI-NEXT:    v_readlane_b32 s53, v32, 13
-; VI-NEXT:    v_readlane_b32 s52, v32, 12
-; VI-NEXT:    v_readlane_b32 s51, v32, 11
-; VI-NEXT:    v_readlane_b32 s50, v32, 10
-; VI-NEXT:    v_readlane_b32 s49, v32, 9
-; VI-NEXT:    v_readlane_b32 s48, v32, 8
-; VI-NEXT:    v_readlane_b32 s39, v32, 7
-; VI-NEXT:    v_readlane_b32 s38, v32, 6
-; VI-NEXT:    v_readlane_b32 s37, v32, 5
-; VI-NEXT:    v_readlane_b32 s36, v32, 4
-; VI-NEXT:    v_readlane_b32 s35, v32, 3
-; VI-NEXT:    v_readlane_b32 s34, v32, 2
-; VI-NEXT:    v_readlane_b32 s31, v32, 1
-; VI-NEXT:    v_readlane_b32 s30, v32, 0
+; VI-NEXT:    v_readlane_b32 s30, v32, 30
+; VI-NEXT:    v_readlane_b32 s31, v32, 31
+; VI-NEXT:    v_readlane_b32 s87, v32, 29
+; VI-NEXT:    v_readlane_b32 s86, v32, 28
+; VI-NEXT:    v_readlane_b32 s85, v32, 27
+; VI-NEXT:    v_readlane_b32 s84, v32, 26
+; VI-NEXT:    v_readlane_b32 s83, v32, 25
+; VI-NEXT:    v_readlane_b32 s82, v32, 24
+; VI-NEXT:    v_readlane_b32 s81, v32, 23
+; VI-NEXT:    v_readlane_b32 s80, v32, 22
+; VI-NEXT:    v_readlane_b32 s71, v32, 21
+; VI-NEXT:    v_readlane_b32 s70, v32, 20
+; VI-NEXT:    v_readlane_b32 s69, v32, 19
+; VI-NEXT:    v_readlane_b32 s68, v32, 18
+; VI-NEXT:    v_readlane_b32 s67, v32, 17
+; VI-NEXT:    v_readlane_b32 s66, v32, 16
+; VI-NEXT:    v_readlane_b32 s65, v32, 15
+; VI-NEXT:    v_readlane_b32 s64, v32, 14
+; VI-NEXT:    v_readlane_b32 s55, v32, 13
+; VI-NEXT:    v_readlane_b32 s54, v32, 12
+; VI-NEXT:    v_readlane_b32 s53, v32, 11
+; VI-NEXT:    v_readlane_b32 s52, v32, 10
+; VI-NEXT:    v_readlane_b32 s51, v32, 9
+; VI-NEXT:    v_readlane_b32 s50, v32, 8
+; VI-NEXT:    v_readlane_b32 s49, v32, 7
+; VI-NEXT:    v_readlane_b32 s48, v32, 6
+; VI-NEXT:    v_readlane_b32 s39, v32, 5
+; VI-NEXT:    v_readlane_b32 s38, v32, 4
+; VI-NEXT:    v_readlane_b32 s37, v32, 3
+; VI-NEXT:    v_readlane_b32 s36, v32, 2
+; VI-NEXT:    v_readlane_b32 s35, v32, 1
+; VI-NEXT:    v_readlane_b32 s34, v32, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -12771,7 +12771,7 @@ end:
   ret <30 x i32> %phi
 }
 
-define <15 x i64> @bitcast_v30f32_to_v15i64(<30 x float> %a, i32 %b) {
+define <15 x i64> @bitcast_v30f32_to_v15i64(<30 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v30f32_to_v15i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12946,7 +12946,7 @@ end:
   ret <15 x i64> %phi
 }
 
-define inreg <15 x i64> @bitcast_v30f32_to_v15i64_scalar(<30 x float> inreg %a, i32 inreg %b) {
+define inreg <15 x i64> @bitcast_v30f32_to_v15i64_scalar(<30 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v30f32_to_v15i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13522,7 +13522,7 @@ end:
   ret <15 x i64> %phi
 }
 
-define <30 x float> @bitcast_v15i64_to_v30f32(<15 x i64> %a, i32 %b) {
+define <30 x float> @bitcast_v15i64_to_v30f32(<15 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v15i64_to_v30f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13720,7 +13720,7 @@ end:
   ret <30 x float> %phi
 }
 
-define inreg <30 x float> @bitcast_v15i64_to_v30f32_scalar(<15 x i64> inreg %a, i32 inreg %b) {
+define inreg <30 x float> @bitcast_v15i64_to_v30f32_scalar(<15 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v15i64_to_v30f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14081,7 +14081,7 @@ end:
   ret <30 x float> %phi
 }
 
-define <15 x double> @bitcast_v30f32_to_v15f64(<30 x float> %a, i32 %b) {
+define <15 x double> @bitcast_v30f32_to_v15f64(<30 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v30f32_to_v15f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14256,7 +14256,7 @@ end:
   ret <15 x double> %phi
 }
 
-define inreg <15 x double> @bitcast_v30f32_to_v15f64_scalar(<30 x float> inreg %a, i32 inreg %b) {
+define inreg <15 x double> @bitcast_v30f32_to_v15f64_scalar(<30 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v30f32_to_v15f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14832,7 +14832,7 @@ end:
   ret <15 x double> %phi
 }
 
-define <30 x float> @bitcast_v15f64_to_v30f32(<15 x double> %a, i32 %b) {
+define <30 x float> @bitcast_v15f64_to_v30f32(<15 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v15f64_to_v30f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14962,7 +14962,7 @@ end:
   ret <30 x float> %phi
 }
 
-define inreg <30 x float> @bitcast_v15f64_to_v30f32_scalar(<15 x double> inreg %a, i32 inreg %b) {
+define inreg <30 x float> @bitcast_v15f64_to_v30f32_scalar(<15 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v15f64_to_v30f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15478,7 +15478,7 @@ end:
   ret <30 x float> %phi
 }
 
-define <60 x i16> @bitcast_v30f32_to_v60i16(<30 x float> %a, i32 %b) {
+define <60 x i16> @bitcast_v30f32_to_v60i16(<30 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v30f32_to_v60i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16356,7 +16356,7 @@ end:
   ret <60 x i16> %phi
 }
 
-define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a, i32 inreg %b) {
+define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v30f32_to_v60i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16364,23 +16364,23 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a,
 ; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v62, s30, 0
-; SI-NEXT:    v_writelane_b32 v62, s31, 1
-; SI-NEXT:    v_writelane_b32 v62, s34, 2
-; SI-NEXT:    v_writelane_b32 v62, s35, 3
-; SI-NEXT:    v_writelane_b32 v62, s36, 4
-; SI-NEXT:    v_writelane_b32 v62, s37, 5
-; SI-NEXT:    v_writelane_b32 v62, s38, 6
-; SI-NEXT:    v_writelane_b32 v62, s39, 7
-; SI-NEXT:    v_writelane_b32 v62, s48, 8
-; SI-NEXT:    v_writelane_b32 v62, s49, 9
-; SI-NEXT:    v_writelane_b32 v62, s50, 10
-; SI-NEXT:    v_writelane_b32 v62, s51, 11
-; SI-NEXT:    v_writelane_b32 v62, s52, 12
-; SI-NEXT:    v_writelane_b32 v62, s53, 13
-; SI-NEXT:    v_writelane_b32 v62, s54, 14
+; SI-NEXT:    v_writelane_b32 v62, s34, 0
+; SI-NEXT:    v_writelane_b32 v62, s35, 1
+; SI-NEXT:    v_writelane_b32 v62, s36, 2
+; SI-NEXT:    v_writelane_b32 v62, s37, 3
+; SI-NEXT:    v_writelane_b32 v62, s38, 4
+; SI-NEXT:    v_writelane_b32 v62, s39, 5
+; SI-NEXT:    v_writelane_b32 v62, s48, 6
+; SI-NEXT:    v_writelane_b32 v62, s49, 7
+; SI-NEXT:    v_writelane_b32 v62, s50, 8
+; SI-NEXT:    v_writelane_b32 v62, s51, 9
+; SI-NEXT:    v_writelane_b32 v62, s52, 10
+; SI-NEXT:    v_writelane_b32 v62, s53, 11
+; SI-NEXT:    v_writelane_b32 v62, s54, 12
+; SI-NEXT:    v_writelane_b32 v62, s55, 13
+; SI-NEXT:    v_writelane_b32 v62, s64, 14
 ; SI-NEXT:    v_readfirstlane_b32 s42, v16
-; SI-NEXT:    v_writelane_b32 v62, s55, 15
+; SI-NEXT:    v_writelane_b32 v62, s30, 15
 ; SI-NEXT:    v_readfirstlane_b32 s5, v15
 ; SI-NEXT:    v_readfirstlane_b32 s4, v14
 ; SI-NEXT:    v_readfirstlane_b32 s7, v13
@@ -16412,7 +16412,7 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a,
 ; SI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill
-; SI-NEXT:    v_writelane_b32 v62, s64, 16
+; SI-NEXT:    v_writelane_b32 v62, s31, 16
 ; SI-NEXT:    s_cbranch_scc0 .LBB29_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s64, s5, 16
@@ -16704,6 +16704,7 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a,
 ; SI-NEXT:    v_or_b32_e32 v28, v28, v30
 ; SI-NEXT:    v_and_b32_e32 v29, 0xffff, v29
 ; SI-NEXT:    v_lshlrev_b32_e32 v30, 16, v55
+; SI-NEXT:    v_readlane_b32 s30, v62, 15
 ; SI-NEXT:    v_or_b32_e32 v11, v11, v39
 ; SI-NEXT:    v_or_b32_e32 v13, v13, v38
 ; SI-NEXT:    v_or_b32_e32 v15, v15, v37
@@ -16714,23 +16715,22 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a,
 ; SI-NEXT:    v_or_b32_e32 v25, v25, v32
 ; SI-NEXT:    v_or_b32_e32 v27, v27, v31
 ; SI-NEXT:    v_or_b32_e32 v29, v29, v30
-; SI-NEXT:    v_readlane_b32 s64, v62, 16
-; SI-NEXT:    v_readlane_b32 s55, v62, 15
-; SI-NEXT:    v_readlane_b32 s54, v62, 14
-; SI-NEXT:    v_readlane_b32 s53, v62, 13
-; SI-NEXT:    v_readlane_b32 s52, v62, 12
-; SI-NEXT:    v_readlane_b32 s51, v62, 11
-; SI-NEXT:    v_readlane_b32 s50, v62, 10
-; SI-NEXT:    v_readlane_b32 s49, v62, 9
-; SI-NEXT:    v_readlane_b32 s48, v62, 8
-; SI-NEXT:    v_readlane_b32 s39, v62, 7
-; SI-NEXT:    v_readlane_b32 s38, v62, 6
-; SI-NEXT:    v_readlane_b32 s37, v62, 5
-; SI-NEXT:    v_readlane_b32 s36, v62, 4
-; SI-NEXT:    v_readlane_b32 s35, v62, 3
-; SI-NEXT:    v_readlane_b32 s34, v62, 2
-; SI-NEXT:    v_readlane_b32 s31, v62, 1
-; SI-NEXT:    v_readlane_b32 s30, v62, 0
+; SI-NEXT:    v_readlane_b32 s31, v62, 16
+; SI-NEXT:    v_readlane_b32 s64, v62, 14
+; SI-NEXT:    v_readlane_b32 s55, v62, 13
+; SI-NEXT:    v_readlane_b32 s54, v62, 12
+; SI-NEXT:    v_readlane_b32 s53, v62, 11
+; SI-NEXT:    v_readlane_b32 s52, v62, 10
+; SI-NEXT:    v_readlane_b32 s51, v62, 9
+; SI-NEXT:    v_readlane_b32 s50, v62, 8
+; SI-NEXT:    v_readlane_b32 s49, v62, 7
+; SI-NEXT:    v_readlane_b32 s48, v62, 6
+; SI-NEXT:    v_readlane_b32 s39, v62, 5
+; SI-NEXT:    v_readlane_b32 s38, v62, 4
+; SI-NEXT:    v_readlane_b32 s37, v62, 3
+; SI-NEXT:    v_readlane_b32 s36, v62, 2
+; SI-NEXT:    v_readlane_b32 s35, v62, 1
+; SI-NEXT:    v_readlane_b32 s34, v62, 0
 ; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -16743,14 +16743,14 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a,
 ; VI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v60, s30, 0
-; VI-NEXT:    v_writelane_b32 v60, s31, 1
-; VI-NEXT:    v_writelane_b32 v60, s34, 2
-; VI-NEXT:    v_writelane_b32 v60, s35, 3
-; VI-NEXT:    v_writelane_b32 v60, s36, 4
-; VI-NEXT:    v_writelane_b32 v60, s37, 5
+; VI-NEXT:    v_writelane_b32 v60, s34, 0
+; VI-NEXT:    v_writelane_b32 v60, s35, 1
+; VI-NEXT:    v_writelane_b32 v60, s36, 2
+; VI-NEXT:    v_writelane_b32 v60, s37, 3
+; VI-NEXT:    v_writelane_b32 v60, s38, 4
+; VI-NEXT:    v_writelane_b32 v60, s39, 5
 ; VI-NEXT:    v_readfirstlane_b32 s4, v16
-; VI-NEXT:    v_writelane_b32 v60, s38, 6
+; VI-NEXT:    v_writelane_b32 v60, s30, 6
 ; VI-NEXT:    v_readfirstlane_b32 s6, v15
 ; VI-NEXT:    v_readfirstlane_b32 s7, v14
 ; VI-NEXT:    v_readfirstlane_b32 s8, v13
@@ -16780,7 +16780,7 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a,
 ; VI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill
-; VI-NEXT:    v_writelane_b32 v60, s39, 7
+; VI-NEXT:    v_writelane_b32 v60, s31, 7
 ; VI-NEXT:    s_cbranch_scc0 .LBB29_3
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    s_lshr_b32 s46, s6, 16
@@ -17024,6 +17024,7 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a,
 ; VI-NEXT:    v_lshlrev_b32_e32 v32, 16, v32
 ; VI-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
 ; VI-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
+; VI-NEXT:    v_readlane_b32 s30, v60, 6
 ; VI-NEXT:    v_or_b32_sdwa v12, v12, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v13, v13, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v14, v14, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -17042,14 +17043,13 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a,
 ; VI-NEXT:    v_or_b32_sdwa v27, v27, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v28, v28, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_readlane_b32 s39, v60, 7
-; VI-NEXT:    v_readlane_b32 s38, v60, 6
-; VI-NEXT:    v_readlane_b32 s37, v60, 5
-; VI-NEXT:    v_readlane_b32 s36, v60, 4
-; VI-NEXT:    v_readlane_b32 s35, v60, 3
-; VI-NEXT:    v_readlane_b32 s34, v60, 2
-; VI-NEXT:    v_readlane_b32 s31, v60, 1
-; VI-NEXT:    v_readlane_b32 s30, v60, 0
+; VI-NEXT:    v_readlane_b32 s31, v60, 7
+; VI-NEXT:    v_readlane_b32 s39, v60, 5
+; VI-NEXT:    v_readlane_b32 s38, v60, 4
+; VI-NEXT:    v_readlane_b32 s37, v60, 3
+; VI-NEXT:    v_readlane_b32 s36, v60, 2
+; VI-NEXT:    v_readlane_b32 s35, v60, 1
+; VI-NEXT:    v_readlane_b32 s34, v60, 0
 ; VI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -17062,10 +17062,10 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a,
 ; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v60, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v60, s31, 1
+; GFX9-NEXT:    v_writelane_b32 v60, s34, 0
+; GFX9-NEXT:    v_writelane_b32 v60, s35, 1
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v16
-; GFX9-NEXT:    v_writelane_b32 v60, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v60, s30, 2
 ; GFX9-NEXT:    v_readfirstlane_b32 s6, v15
 ; GFX9-NEXT:    v_readfirstlane_b32 s7, v14
 ; GFX9-NEXT:    v_readfirstlane_b32 s8, v13
@@ -17095,7 +17095,7 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a,
 ; GFX9-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_writelane_b32 v60, s35, 3
+; GFX9-NEXT:    v_writelane_b32 v60, s31, 3
 ; GFX9-NEXT:    s_cbranch_scc0 .LBB29_3
 ; GFX9-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX9-NEXT:    s_lshr_b32 s46, s6, 16
@@ -17339,6 +17339,7 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a,
 ; GFX9-NEXT:    v_and_b32_e32 v27, 0xffff, v27
 ; GFX9-NEXT:    v_and_b32_e32 v28, 0xffff, v28
 ; GFX9-NEXT:    v_and_b32_e32 v29, 0xffff, v29
+; GFX9-NEXT:    v_readlane_b32 s30, v60, 2
 ; GFX9-NEXT:    v_lshl_or_b32 v12, v55, 16, v12
 ; GFX9-NEXT:    v_lshl_or_b32 v13, v54, 16, v13
 ; GFX9-NEXT:    v_lshl_or_b32 v14, v53, 16, v14
@@ -17357,10 +17358,9 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a,
 ; GFX9-NEXT:    v_lshl_or_b32 v27, v32, 16, v27
 ; GFX9-NEXT:    v_lshl_or_b32 v28, v31, 16, v28
 ; GFX9-NEXT:    v_lshl_or_b32 v29, v30, 16, v29
-; GFX9-NEXT:    v_readlane_b32 s35, v60, 3
-; GFX9-NEXT:    v_readlane_b32 s34, v60, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v60, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v60, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v60, 3
+; GFX9-NEXT:    v_readlane_b32 s35, v60, 1
+; GFX9-NEXT:    v_readlane_b32 s34, v60, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -17834,7 +17834,7 @@ end:
   ret <60 x i16> %phi
 }
 
-define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) {
+define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v60i16_to_v30f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19179,7 +19179,7 @@ end:
   ret <30 x float> %phi
 }
 
-define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, i32 inreg %b) {
+define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v60i16_to_v30f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19187,41 +19187,42 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a,
 ; SI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v30, s30, 0
-; SI-NEXT:    v_writelane_b32 v30, s31, 1
-; SI-NEXT:    v_writelane_b32 v30, s34, 2
-; SI-NEXT:    v_writelane_b32 v30, s35, 3
-; SI-NEXT:    v_writelane_b32 v30, s36, 4
-; SI-NEXT:    v_writelane_b32 v30, s37, 5
-; SI-NEXT:    v_writelane_b32 v30, s38, 6
-; SI-NEXT:    v_writelane_b32 v30, s39, 7
-; SI-NEXT:    v_writelane_b32 v30, s48, 8
-; SI-NEXT:    v_writelane_b32 v30, s49, 9
-; SI-NEXT:    v_writelane_b32 v30, s50, 10
-; SI-NEXT:    v_writelane_b32 v30, s51, 11
-; SI-NEXT:    v_writelane_b32 v30, s52, 12
-; SI-NEXT:    v_writelane_b32 v30, s53, 13
-; SI-NEXT:    v_writelane_b32 v30, s54, 14
-; SI-NEXT:    v_writelane_b32 v30, s55, 15
-; SI-NEXT:    v_writelane_b32 v30, s64, 16
-; SI-NEXT:    v_writelane_b32 v30, s65, 17
-; SI-NEXT:    v_writelane_b32 v30, s66, 18
-; SI-NEXT:    v_writelane_b32 v30, s67, 19
-; SI-NEXT:    v_writelane_b32 v30, s68, 20
-; SI-NEXT:    v_writelane_b32 v30, s69, 21
-; SI-NEXT:    v_writelane_b32 v30, s70, 22
-; SI-NEXT:    v_writelane_b32 v30, s71, 23
-; SI-NEXT:    v_writelane_b32 v30, s80, 24
-; SI-NEXT:    v_writelane_b32 v30, s81, 25
-; SI-NEXT:    v_writelane_b32 v30, s82, 26
-; SI-NEXT:    v_writelane_b32 v30, s83, 27
-; SI-NEXT:    v_writelane_b32 v30, s84, 28
-; SI-NEXT:    v_writelane_b32 v30, s85, 29
-; SI-NEXT:    v_writelane_b32 v30, s86, 30
-; SI-NEXT:    v_writelane_b32 v30, s87, 31
-; SI-NEXT:    v_writelane_b32 v30, s96, 32
-; SI-NEXT:    v_writelane_b32 v30, s97, 33
-; SI-NEXT:    v_writelane_b32 v30, s98, 34
+; SI-NEXT:    v_writelane_b32 v30, s34, 0
+; SI-NEXT:    v_writelane_b32 v30, s35, 1
+; SI-NEXT:    v_writelane_b32 v30, s36, 2
+; SI-NEXT:    v_writelane_b32 v30, s37, 3
+; SI-NEXT:    v_writelane_b32 v30, s38, 4
+; SI-NEXT:    v_writelane_b32 v30, s39, 5
+; SI-NEXT:    v_writelane_b32 v30, s48, 6
+; SI-NEXT:    v_writelane_b32 v30, s49, 7
+; SI-NEXT:    v_writelane_b32 v30, s50, 8
+; SI-NEXT:    v_writelane_b32 v30, s51, 9
+; SI-NEXT:    v_writelane_b32 v30, s52, 10
+; SI-NEXT:    v_writelane_b32 v30, s53, 11
+; SI-NEXT:    v_writelane_b32 v30, s54, 12
+; SI-NEXT:    v_writelane_b32 v30, s55, 13
+; SI-NEXT:    v_writelane_b32 v30, s64, 14
+; SI-NEXT:    v_writelane_b32 v30, s65, 15
+; SI-NEXT:    v_writelane_b32 v30, s66, 16
+; SI-NEXT:    v_writelane_b32 v30, s67, 17
+; SI-NEXT:    v_writelane_b32 v30, s68, 18
+; SI-NEXT:    v_writelane_b32 v30, s69, 19
+; SI-NEXT:    v_writelane_b32 v30, s70, 20
+; SI-NEXT:    v_writelane_b32 v30, s71, 21
+; SI-NEXT:    v_writelane_b32 v30, s80, 22
+; SI-NEXT:    v_writelane_b32 v30, s81, 23
+; SI-NEXT:    v_writelane_b32 v30, s82, 24
+; SI-NEXT:    v_writelane_b32 v30, s83, 25
+; SI-NEXT:    v_writelane_b32 v30, s84, 26
+; SI-NEXT:    v_writelane_b32 v30, s85, 27
+; SI-NEXT:    v_writelane_b32 v30, s86, 28
+; SI-NEXT:    v_writelane_b32 v30, s87, 29
+; SI-NEXT:    v_writelane_b32 v30, s96, 30
+; SI-NEXT:    v_writelane_b32 v30, s97, 31
+; SI-NEXT:    v_writelane_b32 v30, s98, 32
+; SI-NEXT:    v_writelane_b32 v30, s99, 33
+; SI-NEXT:    v_writelane_b32 v30, s30, 34
+; SI-NEXT:    v_writelane_b32 v30, s31, 35
 ; SI-NEXT:    v_readfirstlane_b32 s7, v15
 ; SI-NEXT:    v_readfirstlane_b32 s9, v14
 ; SI-NEXT:    v_readfirstlane_b32 s11, v13
@@ -19238,7 +19239,6 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a,
 ; SI-NEXT:    v_readfirstlane_b32 s70, v2
 ; SI-NEXT:    v_readfirstlane_b32 s81, v1
 ; SI-NEXT:    v_readfirstlane_b32 s84, v0
-; SI-NEXT:    v_writelane_b32 v30, s99, 35
 ; SI-NEXT:    s_lshr_b32 s90, s29, 16
 ; SI-NEXT:    s_lshr_b32 s92, s28, 16
 ; SI-NEXT:    s_lshr_b32 s94, s27, 16
@@ -19516,6 +19516,7 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a,
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_add_i32 s65, s4, 0x30000
 ; SI-NEXT:  .LBB31_3: ; %end
+; SI-NEXT:    v_readlane_b32 s30, v30, 34
 ; SI-NEXT:    v_mov_b32_e32 v0, s36
 ; SI-NEXT:    v_mov_b32_e32 v1, s37
 ; SI-NEXT:    v_mov_b32_e32 v2, s38
@@ -19546,42 +19547,41 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a,
 ; SI-NEXT:    v_mov_b32_e32 v27, s63
 ; SI-NEXT:    v_mov_b32_e32 v28, s64
 ; SI-NEXT:    v_mov_b32_e32 v29, s65
-; SI-NEXT:    v_readlane_b32 s99, v30, 35
-; SI-NEXT:    v_readlane_b32 s98, v30, 34
-; SI-NEXT:    v_readlane_b32 s97, v30, 33
-; SI-NEXT:    v_readlane_b32 s96, v30, 32
-; SI-NEXT:    v_readlane_b32 s87, v30, 31
-; SI-NEXT:    v_readlane_b32 s86, v30, 30
-; SI-NEXT:    v_readlane_b32 s85, v30, 29
-; SI-NEXT:    v_readlane_b32 s84, v30, 28
-; SI-NEXT:    v_readlane_b32 s83, v30, 27
-; SI-NEXT:    v_readlane_b32 s82, v30, 26
-; SI-NEXT:    v_readlane_b32 s81, v30, 25
-; SI-NEXT:    v_readlane_b32 s80, v30, 24
-; SI-NEXT:    v_readlane_b32 s71, v30, 23
-; SI-NEXT:    v_readlane_b32 s70, v30, 22
-; SI-NEXT:    v_readlane_b32 s69, v30, 21
-; SI-NEXT:    v_readlane_b32 s68, v30, 20
-; SI-NEXT:    v_readlane_b32 s67, v30, 19
-; SI-NEXT:    v_readlane_b32 s66, v30, 18
-; SI-NEXT:    v_readlane_b32 s65, v30, 17
-; SI-NEXT:    v_readlane_b32 s64, v30, 16
-; SI-NEXT:    v_readlane_b32 s55, v30, 15
-; SI-NEXT:    v_readlane_b32 s54, v30, 14
-; SI-NEXT:    v_readlane_b32 s53, v30, 13
-; SI-NEXT:    v_readlane_b32 s52, v30, 12
-; SI-NEXT:    v_readlane_b32 s51, v30, 11
-; SI-NEXT:    v_readlane_b32 s50, v30, 10
-; SI-NEXT:    v_readlane_b32 s49, v30, 9
-; SI-NEXT:    v_readlane_b32 s48, v30, 8
-; SI-NEXT:    v_readlane_b32 s39, v30, 7
-; SI-NEXT:    v_readlane_b32 s38, v30, 6
-; SI-NEXT:    v_readlane_b32 s37, v30, 5
-; SI-NEXT:    v_readlane_b32 s36, v30, 4
-; SI-NEXT:    v_readlane_b32 s35, v30, 3
-; SI-NEXT:    v_readlane_b32 s34, v30, 2
-; SI-NEXT:    v_readlane_b32 s31, v30, 1
-; SI-NEXT:    v_readlane_b32 s30, v30, 0
+; SI-NEXT:    v_readlane_b32 s31, v30, 35
+; SI-NEXT:    v_readlane_b32 s99, v30, 33
+; SI-NEXT:    v_readlane_b32 s98, v30, 32
+; SI-NEXT:    v_readlane_b32 s97, v30, 31
+; SI-NEXT:    v_readlane_b32 s96, v30, 30
+; SI-NEXT:    v_readlane_b32 s87, v30, 29
+; SI-NEXT:    v_readlane_b32 s86, v30, 28
+; SI-NEXT:    v_readlane_b32 s85, v30, 27
+; SI-NEXT:    v_readlane_b32 s84, v30, 26
+; SI-NEXT:    v_readlane_b32 s83, v30, 25
+; SI-NEXT:    v_readlane_b32 s82, v30, 24
+; SI-NEXT:    v_readlane_b32 s81, v30, 23
+; SI-NEXT:    v_readlane_b32 s80, v30, 22
+; SI-NEXT:    v_readlane_b32 s71, v30, 21
+; SI-NEXT:    v_readlane_b32 s70, v30, 20
+; SI-NEXT:    v_readlane_b32 s69, v30, 19
+; SI-NEXT:    v_readlane_b32 s68, v30, 18
+; SI-NEXT:    v_readlane_b32 s67, v30, 17
+; SI-NEXT:    v_readlane_b32 s66, v30, 16
+; SI-NEXT:    v_readlane_b32 s65, v30, 15
+; SI-NEXT:    v_readlane_b32 s64, v30, 14
+; SI-NEXT:    v_readlane_b32 s55, v30, 13
+; SI-NEXT:    v_readlane_b32 s54, v30, 12
+; SI-NEXT:    v_readlane_b32 s53, v30, 11
+; SI-NEXT:    v_readlane_b32 s52, v30, 10
+; SI-NEXT:    v_readlane_b32 s51, v30, 9
+; SI-NEXT:    v_readlane_b32 s50, v30, 8
+; SI-NEXT:    v_readlane_b32 s49, v30, 7
+; SI-NEXT:    v_readlane_b32 s48, v30, 6
+; SI-NEXT:    v_readlane_b32 s39, v30, 5
+; SI-NEXT:    v_readlane_b32 s38, v30, 4
+; SI-NEXT:    v_readlane_b32 s37, v30, 3
+; SI-NEXT:    v_readlane_b32 s36, v30, 2
+; SI-NEXT:    v_readlane_b32 s35, v30, 1
+; SI-NEXT:    v_readlane_b32 s34, v30, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -19598,47 +19598,48 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a,
 ; VI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v30, s30, 0
-; VI-NEXT:    v_writelane_b32 v30, s31, 1
-; VI-NEXT:    v_writelane_b32 v30, s34, 2
-; VI-NEXT:    v_writelane_b32 v30, s35, 3
-; VI-NEXT:    v_writelane_b32 v30, s36, 4
-; VI-NEXT:    v_writelane_b32 v30, s37, 5
-; VI-NEXT:    v_writelane_b32 v30, s38, 6
-; VI-NEXT:    v_writelane_b32 v30, s39, 7
-; VI-NEXT:    v_writelane_b32 v30, s48, 8
-; VI-NEXT:    v_writelane_b32 v30, s49, 9
-; VI-NEXT:    v_writelane_b32 v30, s50, 10
-; VI-NEXT:    v_writelane_b32 v30, s51, 11
-; VI-NEXT:    v_writelane_b32 v30, s52, 12
-; VI-NEXT:    v_writelane_b32 v30, s53, 13
-; VI-NEXT:    v_writelane_b32 v30, s54, 14
-; VI-NEXT:    v_writelane_b32 v30, s55, 15
-; VI-NEXT:    v_writelane_b32 v30, s64, 16
-; VI-NEXT:    v_writelane_b32 v30, s65, 17
-; VI-NEXT:    v_writelane_b32 v30, s66, 18
-; VI-NEXT:    v_writelane_b32 v30, s67, 19
-; VI-NEXT:    v_writelane_b32 v30, s68, 20
-; VI-NEXT:    v_writelane_b32 v30, s69, 21
-; VI-NEXT:    v_writelane_b32 v30, s70, 22
-; VI-NEXT:    v_writelane_b32 v30, s71, 23
-; VI-NEXT:    v_writelane_b32 v30, s80, 24
-; VI-NEXT:    v_writelane_b32 v30, s81, 25
-; VI-NEXT:    v_writelane_b32 v30, s82, 26
+; VI-NEXT:    v_writelane_b32 v30, s34, 0
+; VI-NEXT:    v_writelane_b32 v30, s35, 1
+; VI-NEXT:    v_writelane_b32 v30, s36, 2
+; VI-NEXT:    v_writelane_b32 v30, s37, 3
+; VI-NEXT:    v_writelane_b32 v30, s38, 4
+; VI-NEXT:    v_writelane_b32 v30, s39, 5
+; VI-NEXT:    v_writelane_b32 v30, s48, 6
+; VI-NEXT:    v_writelane_b32 v30, s49, 7
+; VI-NEXT:    v_writelane_b32 v30, s50, 8
+; VI-NEXT:    v_writelane_b32 v30, s51, 9
+; VI-NEXT:    v_writelane_b32 v30, s52, 10
+; VI-NEXT:    v_writelane_b32 v30, s53, 11
+; VI-NEXT:    v_writelane_b32 v30, s54, 12
+; VI-NEXT:    v_writelane_b32 v30, s55, 13
+; VI-NEXT:    v_writelane_b32 v30, s64, 14
+; VI-NEXT:    v_writelane_b32 v30, s65, 15
+; VI-NEXT:    v_writelane_b32 v30, s66, 16
+; VI-NEXT:    v_writelane_b32 v30, s67, 17
+; VI-NEXT:    v_writelane_b32 v30, s68, 18
+; VI-NEXT:    v_writelane_b32 v30, s69, 19
+; VI-NEXT:    v_writelane_b32 v30, s70, 20
+; VI-NEXT:    v_writelane_b32 v30, s71, 21
+; VI-NEXT:    v_writelane_b32 v30, s80, 22
+; VI-NEXT:    v_writelane_b32 v30, s81, 23
+; VI-NEXT:    v_writelane_b32 v30, s82, 24
+; VI-NEXT:    v_writelane_b32 v30, s83, 25
+; VI-NEXT:    v_writelane_b32 v30, s84, 26
+; VI-NEXT:    v_writelane_b32 v30, s85, 27
 ; VI-NEXT:    v_readfirstlane_b32 s11, v13
-; VI-NEXT:    v_writelane_b32 v30, s83, 27
+; VI-NEXT:    v_writelane_b32 v30, s86, 28
 ; VI-NEXT:    s_lshr_b32 s63, s11, 16
 ; VI-NEXT:    v_readfirstlane_b32 s13, v12
 ; VI-NEXT:    ; implicit-def: $vgpr31 : SGPR spill to VGPR lane
-; VI-NEXT:    v_writelane_b32 v30, s84, 28
+; VI-NEXT:    v_writelane_b32 v30, s87, 29
 ; VI-NEXT:    s_lshr_b32 s62, s13, 16
 ; VI-NEXT:    v_readfirstlane_b32 s15, v11
 ; VI-NEXT:    v_writelane_b32 v31, s63, 0
-; VI-NEXT:    v_writelane_b32 v30, s85, 29
+; VI-NEXT:    v_writelane_b32 v30, s30, 30
 ; VI-NEXT:    s_lshr_b32 s61, s15, 16
 ; VI-NEXT:    v_readfirstlane_b32 s73, v10
 ; VI-NEXT:    v_writelane_b32 v31, s62, 1
-; VI-NEXT:    v_writelane_b32 v30, s86, 30
+; VI-NEXT:    v_writelane_b32 v30, s31, 31
 ; VI-NEXT:    v_readfirstlane_b32 s7, v15
 ; VI-NEXT:    v_readfirstlane_b32 s9, v14
 ; VI-NEXT:    s_lshr_b32 s60, s73, 16
@@ -19653,7 +19654,6 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a,
 ; VI-NEXT:    v_readfirstlane_b32 s85, v1
 ; VI-NEXT:    v_readfirstlane_b32 s6, v0
 ; VI-NEXT:    v_writelane_b32 v31, s61, 2
-; VI-NEXT:    v_writelane_b32 v30, s87, 31
 ; VI-NEXT:    s_lshr_b32 s90, s29, 16
 ; VI-NEXT:    s_lshr_b32 s30, s28, 16
 ; VI-NEXT:    s_lshr_b32 s34, s27, 16
@@ -19936,6 +19936,7 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a,
 ; VI-NEXT:    s_or_b32 s4, s5, s4
 ; VI-NEXT:    s_add_i32 s65, s4, 0x30000
 ; VI-NEXT:  .LBB31_3: ; %end
+; VI-NEXT:    v_readlane_b32 s30, v30, 30
 ; VI-NEXT:    v_mov_b32_e32 v0, s36
 ; VI-NEXT:    v_mov_b32_e32 v1, s37
 ; VI-NEXT:    v_mov_b32_e32 v2, s38
@@ -19966,38 +19967,37 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a,
 ; VI-NEXT:    v_mov_b32_e32 v27, s63
 ; VI-NEXT:    v_mov_b32_e32 v28, s64
 ; VI-NEXT:    v_mov_b32_e32 v29, s65
-; VI-NEXT:    v_readlane_b32 s87, v30, 31
-; VI-NEXT:    v_readlane_b32 s86, v30, 30
-; VI-NEXT:    v_readlane_b32 s85, v30, 29
-; VI-NEXT:    v_readlane_b32 s84, v30, 28
-; VI-NEXT:    v_readlane_b32 s83, v30, 27
-; VI-NEXT:    v_readlane_b32 s82, v30, 26
-; VI-NEXT:    v_readlane_b32 s81, v30, 25
-; VI-NEXT:    v_readlane_b32 s80, v30, 24
-; VI-NEXT:    v_readlane_b32 s71, v30, 23
-; VI-NEXT:    v_readlane_b32 s70, v30, 22
-; VI-NEXT:    v_readlane_b32 s69, v30, 21
-; VI-NEXT:    v_readlane_b32 s68, v30, 20
-; VI-NEXT:    v_readlane_b32 s67, v30, 19
-; VI-NEXT:    v_readlane_b32 s66, v30, 18
-; VI-NEXT:    v_readlane_b32 s65, v30, 17
-; VI-NEXT:    v_readlane_b32 s64, v30, 16
-; VI-NEXT:    v_readlane_b32 s55, v30, 15
-; VI-NEXT:    v_readlane_b32 s54, v30, 14
-; VI-NEXT:    v_readlane_b32 s53, v30, 13
-; VI-NEXT:    v_readlane_b32 s52, v30, 12
-; VI-NEXT:    v_readlane_b32 s51, v30, 11
-; VI-NEXT:    v_readlane_b32 s50, v30, 10
-; VI-NEXT:    v_readlane_b32 s49, v30, 9
-; VI-NEXT:    v_readlane_b32 s48, v30, 8
-; VI-NEXT:    v_readlane_b32 s39, v30, 7
-; VI-NEXT:    v_readlane_b32 s38, v30, 6
-; VI-NEXT:    v_readlane_b32 s37, v30, 5
-; VI-NEXT:    v_readlane_b32 s36, v30, 4
-; VI-NEXT:    v_readlane_b32 s35, v30, 3
-; VI-NEXT:    v_readlane_b32 s34, v30, 2
-; VI-NEXT:    v_readlane_b32 s31, v30, 1
-; VI-NEXT:    v_readlane_b32 s30, v30, 0
+; VI-NEXT:    v_readlane_b32 s31, v30, 31
+; VI-NEXT:    v_readlane_b32 s87, v30, 29
+; VI-NEXT:    v_readlane_b32 s86, v30, 28
+; VI-NEXT:    v_readlane_b32 s85, v30, 27
+; VI-NEXT:    v_readlane_b32 s84, v30, 26
+; VI-NEXT:    v_readlane_b32 s83, v30, 25
+; VI-NEXT:    v_readlane_b32 s82, v30, 24
+; VI-NEXT:    v_readlane_b32 s81, v30, 23
+; VI-NEXT:    v_readlane_b32 s80, v30, 22
+; VI-NEXT:    v_readlane_b32 s71, v30, 21
+; VI-NEXT:    v_readlane_b32 s70, v30, 20
+; VI-NEXT:    v_readlane_b32 s69, v30, 19
+; VI-NEXT:    v_readlane_b32 s68, v30, 18
+; VI-NEXT:    v_readlane_b32 s67, v30, 17
+; VI-NEXT:    v_readlane_b32 s66, v30, 16
+; VI-NEXT:    v_readlane_b32 s65, v30, 15
+; VI-NEXT:    v_readlane_b32 s64, v30, 14
+; VI-NEXT:    v_readlane_b32 s55, v30, 13
+; VI-NEXT:    v_readlane_b32 s54, v30, 12
+; VI-NEXT:    v_readlane_b32 s53, v30, 11
+; VI-NEXT:    v_readlane_b32 s52, v30, 10
+; VI-NEXT:    v_readlane_b32 s51, v30, 9
+; VI-NEXT:    v_readlane_b32 s50, v30, 8
+; VI-NEXT:    v_readlane_b32 s49, v30, 7
+; VI-NEXT:    v_readlane_b32 s48, v30, 6
+; VI-NEXT:    v_readlane_b32 s39, v30, 5
+; VI-NEXT:    v_readlane_b32 s38, v30, 4
+; VI-NEXT:    v_readlane_b32 s37, v30, 3
+; VI-NEXT:    v_readlane_b32 s36, v30, 2
+; VI-NEXT:    v_readlane_b32 s35, v30, 1
+; VI-NEXT:    v_readlane_b32 s34, v30, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -20352,7 +20352,7 @@ end:
   ret <30 x float> %phi
 }
 
-define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) {
+define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v30f32_to_v60f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -21230,7 +21230,7 @@ end:
   ret <60 x half> %phi
 }
 
-define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, i32 inreg %b) {
+define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v30f32_to_v60f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -21238,23 +21238,23 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a,
 ; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v62, s30, 0
-; SI-NEXT:    v_writelane_b32 v62, s31, 1
-; SI-NEXT:    v_writelane_b32 v62, s34, 2
-; SI-NEXT:    v_writelane_b32 v62, s35, 3
-; SI-NEXT:    v_writelane_b32 v62, s36, 4
-; SI-NEXT:    v_writelane_b32 v62, s37, 5
-; SI-NEXT:    v_writelane_b32 v62, s38, 6
-; SI-NEXT:    v_writelane_b32 v62, s39, 7
-; SI-NEXT:    v_writelane_b32 v62, s48, 8
-; SI-NEXT:    v_writelane_b32 v62, s49, 9
-; SI-NEXT:    v_writelane_b32 v62, s50, 10
-; SI-NEXT:    v_writelane_b32 v62, s51, 11
-; SI-NEXT:    v_writelane_b32 v62, s52, 12
-; SI-NEXT:    v_writelane_b32 v62, s53, 13
-; SI-NEXT:    v_writelane_b32 v62, s54, 14
+; SI-NEXT:    v_writelane_b32 v62, s34, 0
+; SI-NEXT:    v_writelane_b32 v62, s35, 1
+; SI-NEXT:    v_writelane_b32 v62, s36, 2
+; SI-NEXT:    v_writelane_b32 v62, s37, 3
+; SI-NEXT:    v_writelane_b32 v62, s38, 4
+; SI-NEXT:    v_writelane_b32 v62, s39, 5
+; SI-NEXT:    v_writelane_b32 v62, s48, 6
+; SI-NEXT:    v_writelane_b32 v62, s49, 7
+; SI-NEXT:    v_writelane_b32 v62, s50, 8
+; SI-NEXT:    v_writelane_b32 v62, s51, 9
+; SI-NEXT:    v_writelane_b32 v62, s52, 10
+; SI-NEXT:    v_writelane_b32 v62, s53, 11
+; SI-NEXT:    v_writelane_b32 v62, s54, 12
+; SI-NEXT:    v_writelane_b32 v62, s55, 13
+; SI-NEXT:    v_writelane_b32 v62, s64, 14
 ; SI-NEXT:    v_readfirstlane_b32 s42, v16
-; SI-NEXT:    v_writelane_b32 v62, s55, 15
+; SI-NEXT:    v_writelane_b32 v62, s30, 15
 ; SI-NEXT:    v_readfirstlane_b32 s5, v15
 ; SI-NEXT:    v_readfirstlane_b32 s4, v14
 ; SI-NEXT:    v_readfirstlane_b32 s7, v13
@@ -21286,7 +21286,7 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a,
 ; SI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill
-; SI-NEXT:    v_writelane_b32 v62, s64, 16
+; SI-NEXT:    v_writelane_b32 v62, s31, 16
 ; SI-NEXT:    s_cbranch_scc0 .LBB33_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s64, s5, 16
@@ -21578,6 +21578,7 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a,
 ; SI-NEXT:    v_or_b32_e32 v28, v28, v30
 ; SI-NEXT:    v_and_b32_e32 v29, 0xffff, v29
 ; SI-NEXT:    v_lshlrev_b32_e32 v30, 16, v55
+; SI-NEXT:    v_readlane_b32 s30, v62, 15
 ; SI-NEXT:    v_or_b32_e32 v11, v11, v39
 ; SI-NEXT:    v_or_b32_e32 v13, v13, v38
 ; SI-NEXT:    v_or_b32_e32 v15, v15, v37
@@ -21588,23 +21589,22 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a,
 ; SI-NEXT:    v_or_b32_e32 v25, v25, v32
 ; SI-NEXT:    v_or_b32_e32 v27, v27, v31
 ; SI-NEXT:    v_or_b32_e32 v29, v29, v30
-; SI-NEXT:    v_readlane_b32 s64, v62, 16
-; SI-NEXT:    v_readlane_b32 s55, v62, 15
-; SI-NEXT:    v_readlane_b32 s54, v62, 14
-; SI-NEXT:    v_readlane_b32 s53, v62, 13
-; SI-NEXT:    v_readlane_b32 s52, v62, 12
-; SI-NEXT:    v_readlane_b32 s51, v62, 11
-; SI-NEXT:    v_readlane_b32 s50, v62, 10
-; SI-NEXT:    v_readlane_b32 s49, v62, 9
-; SI-NEXT:    v_readlane_b32 s48, v62, 8
-; SI-NEXT:    v_readlane_b32 s39, v62, 7
-; SI-NEXT:    v_readlane_b32 s38, v62, 6
-; SI-NEXT:    v_readlane_b32 s37, v62, 5
-; SI-NEXT:    v_readlane_b32 s36, v62, 4
-; SI-NEXT:    v_readlane_b32 s35, v62, 3
-; SI-NEXT:    v_readlane_b32 s34, v62, 2
-; SI-NEXT:    v_readlane_b32 s31, v62, 1
-; SI-NEXT:    v_readlane_b32 s30, v62, 0
+; SI-NEXT:    v_readlane_b32 s31, v62, 16
+; SI-NEXT:    v_readlane_b32 s64, v62, 14
+; SI-NEXT:    v_readlane_b32 s55, v62, 13
+; SI-NEXT:    v_readlane_b32 s54, v62, 12
+; SI-NEXT:    v_readlane_b32 s53, v62, 11
+; SI-NEXT:    v_readlane_b32 s52, v62, 10
+; SI-NEXT:    v_readlane_b32 s51, v62, 9
+; SI-NEXT:    v_readlane_b32 s50, v62, 8
+; SI-NEXT:    v_readlane_b32 s49, v62, 7
+; SI-NEXT:    v_readlane_b32 s48, v62, 6
+; SI-NEXT:    v_readlane_b32 s39, v62, 5
+; SI-NEXT:    v_readlane_b32 s38, v62, 4
+; SI-NEXT:    v_readlane_b32 s37, v62, 3
+; SI-NEXT:    v_readlane_b32 s36, v62, 2
+; SI-NEXT:    v_readlane_b32 s35, v62, 1
+; SI-NEXT:    v_readlane_b32 s34, v62, 0
 ; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -21617,14 +21617,14 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a,
 ; VI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v60, s30, 0
-; VI-NEXT:    v_writelane_b32 v60, s31, 1
-; VI-NEXT:    v_writelane_b32 v60, s34, 2
-; VI-NEXT:    v_writelane_b32 v60, s35, 3
-; VI-NEXT:    v_writelane_b32 v60, s36, 4
-; VI-NEXT:    v_writelane_b32 v60, s37, 5
+; VI-NEXT:    v_writelane_b32 v60, s34, 0
+; VI-NEXT:    v_writelane_b32 v60, s35, 1
+; VI-NEXT:    v_writelane_b32 v60, s36, 2
+; VI-NEXT:    v_writelane_b32 v60, s37, 3
+; VI-NEXT:    v_writelane_b32 v60, s38, 4
+; VI-NEXT:    v_writelane_b32 v60, s39, 5
 ; VI-NEXT:    v_readfirstlane_b32 s4, v16
-; VI-NEXT:    v_writelane_b32 v60, s38, 6
+; VI-NEXT:    v_writelane_b32 v60, s30, 6
 ; VI-NEXT:    v_readfirstlane_b32 s6, v15
 ; VI-NEXT:    v_readfirstlane_b32 s7, v14
 ; VI-NEXT:    v_readfirstlane_b32 s8, v13
@@ -21654,7 +21654,7 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a,
 ; VI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill
-; VI-NEXT:    v_writelane_b32 v60, s39, 7
+; VI-NEXT:    v_writelane_b32 v60, s31, 7
 ; VI-NEXT:    s_cbranch_scc0 .LBB33_3
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    s_lshr_b32 s46, s6, 16
@@ -21898,6 +21898,7 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a,
 ; VI-NEXT:    v_lshlrev_b32_e32 v32, 16, v32
 ; VI-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
 ; VI-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
+; VI-NEXT:    v_readlane_b32 s30, v60, 6
 ; VI-NEXT:    v_or_b32_sdwa v12, v12, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v13, v13, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v14, v14, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -21916,14 +21917,13 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a,
 ; VI-NEXT:    v_or_b32_sdwa v27, v27, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v28, v28, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_readlane_b32 s39, v60, 7
-; VI-NEXT:    v_readlane_b32 s38, v60, 6
-; VI-NEXT:    v_readlane_b32 s37, v60, 5
-; VI-NEXT:    v_readlane_b32 s36, v60, 4
-; VI-NEXT:    v_readlane_b32 s35, v60, 3
-; VI-NEXT:    v_readlane_b32 s34, v60, 2
-; VI-NEXT:    v_readlane_b32 s31, v60, 1
-; VI-NEXT:    v_readlane_b32 s30, v60, 0
+; VI-NEXT:    v_readlane_b32 s31, v60, 7
+; VI-NEXT:    v_readlane_b32 s39, v60, 5
+; VI-NEXT:    v_readlane_b32 s38, v60, 4
+; VI-NEXT:    v_readlane_b32 s37, v60, 3
+; VI-NEXT:    v_readlane_b32 s36, v60, 2
+; VI-NEXT:    v_readlane_b32 s35, v60, 1
+; VI-NEXT:    v_readlane_b32 s34, v60, 0
 ; VI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -21936,10 +21936,10 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a,
 ; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v60, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v60, s31, 1
+; GFX9-NEXT:    v_writelane_b32 v60, s34, 0
+; GFX9-NEXT:    v_writelane_b32 v60, s35, 1
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v16
-; GFX9-NEXT:    v_writelane_b32 v60, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v60, s30, 2
 ; GFX9-NEXT:    v_readfirstlane_b32 s6, v15
 ; GFX9-NEXT:    v_readfirstlane_b32 s7, v14
 ; GFX9-NEXT:    v_readfirstlane_b32 s8, v13
@@ -21969,7 +21969,7 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a,
 ; GFX9-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_writelane_b32 v60, s35, 3
+; GFX9-NEXT:    v_writelane_b32 v60, s31, 3
 ; GFX9-NEXT:    s_cbranch_scc0 .LBB33_3
 ; GFX9-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX9-NEXT:    s_lshr_b32 s46, s6, 16
@@ -22213,6 +22213,7 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a,
 ; GFX9-NEXT:    v_and_b32_e32 v27, 0xffff, v27
 ; GFX9-NEXT:    v_and_b32_e32 v28, 0xffff, v28
 ; GFX9-NEXT:    v_and_b32_e32 v29, 0xffff, v29
+; GFX9-NEXT:    v_readlane_b32 s30, v60, 2
 ; GFX9-NEXT:    v_lshl_or_b32 v12, v55, 16, v12
 ; GFX9-NEXT:    v_lshl_or_b32 v13, v54, 16, v13
 ; GFX9-NEXT:    v_lshl_or_b32 v14, v53, 16, v14
@@ -22231,10 +22232,9 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a,
 ; GFX9-NEXT:    v_lshl_or_b32 v27, v32, 16, v27
 ; GFX9-NEXT:    v_lshl_or_b32 v28, v31, 16, v28
 ; GFX9-NEXT:    v_lshl_or_b32 v29, v30, 16, v29
-; GFX9-NEXT:    v_readlane_b32 s35, v60, 3
-; GFX9-NEXT:    v_readlane_b32 s34, v60, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v60, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v60, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v60, 3
+; GFX9-NEXT:    v_readlane_b32 s35, v60, 1
+; GFX9-NEXT:    v_readlane_b32 s34, v60, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -22708,7 +22708,7 @@ end:
   ret <60 x half> %phi
 }
 
-define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) {
+define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v60f16_to_v30f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -24197,7 +24197,7 @@ end:
   ret <30 x float> %phi
 }
 
-define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, i32 inreg %b) {
+define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v60f16_to_v30f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -24205,41 +24205,42 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a,
 ; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v32, s30, 0
-; SI-NEXT:    v_writelane_b32 v32, s31, 1
-; SI-NEXT:    v_writelane_b32 v32, s34, 2
-; SI-NEXT:    v_writelane_b32 v32, s35, 3
-; SI-NEXT:    v_writelane_b32 v32, s36, 4
-; SI-NEXT:    v_writelane_b32 v32, s37, 5
-; SI-NEXT:    v_writelane_b32 v32, s38, 6
-; SI-NEXT:    v_writelane_b32 v32, s39, 7
-; SI-NEXT:    v_writelane_b32 v32, s48, 8
-; SI-NEXT:    v_writelane_b32 v32, s49, 9
-; SI-NEXT:    v_writelane_b32 v32, s50, 10
-; SI-NEXT:    v_writelane_b32 v32, s51, 11
-; SI-NEXT:    v_writelane_b32 v32, s52, 12
-; SI-NEXT:    v_writelane_b32 v32, s53, 13
-; SI-NEXT:    v_writelane_b32 v32, s54, 14
-; SI-NEXT:    v_writelane_b32 v32, s55, 15
-; SI-NEXT:    v_writelane_b32 v32, s64, 16
-; SI-NEXT:    v_writelane_b32 v32, s65, 17
-; SI-NEXT:    v_writelane_b32 v32, s66, 18
-; SI-NEXT:    v_writelane_b32 v32, s67, 19
-; SI-NEXT:    v_writelane_b32 v32, s68, 20
-; SI-NEXT:    v_writelane_b32 v32, s69, 21
-; SI-NEXT:    v_writelane_b32 v32, s70, 22
-; SI-NEXT:    v_writelane_b32 v32, s71, 23
-; SI-NEXT:    v_writelane_b32 v32, s80, 24
-; SI-NEXT:    v_writelane_b32 v32, s81, 25
-; SI-NEXT:    v_writelane_b32 v32, s82, 26
-; SI-NEXT:    v_writelane_b32 v32, s83, 27
-; SI-NEXT:    v_writelane_b32 v32, s84, 28
-; SI-NEXT:    v_writelane_b32 v32, s85, 29
-; SI-NEXT:    v_writelane_b32 v32, s86, 30
-; SI-NEXT:    v_writelane_b32 v32, s87, 31
-; SI-NEXT:    v_writelane_b32 v32, s96, 32
-; SI-NEXT:    v_writelane_b32 v32, s97, 33
-; SI-NEXT:    v_writelane_b32 v32, s98, 34
+; SI-NEXT:    v_writelane_b32 v32, s34, 0
+; SI-NEXT:    v_writelane_b32 v32, s35, 1
+; SI-NEXT:    v_writelane_b32 v32, s36, 2
+; SI-NEXT:    v_writelane_b32 v32, s37, 3
+; SI-NEXT:    v_writelane_b32 v32, s38, 4
+; SI-NEXT:    v_writelane_b32 v32, s39, 5
+; SI-NEXT:    v_writelane_b32 v32, s48, 6
+; SI-NEXT:    v_writelane_b32 v32, s49, 7
+; SI-NEXT:    v_writelane_b32 v32, s50, 8
+; SI-NEXT:    v_writelane_b32 v32, s51, 9
+; SI-NEXT:    v_writelane_b32 v32, s52, 10
+; SI-NEXT:    v_writelane_b32 v32, s53, 11
+; SI-NEXT:    v_writelane_b32 v32, s54, 12
+; SI-NEXT:    v_writelane_b32 v32, s55, 13
+; SI-NEXT:    v_writelane_b32 v32, s64, 14
+; SI-NEXT:    v_writelane_b32 v32, s65, 15
+; SI-NEXT:    v_writelane_b32 v32, s66, 16
+; SI-NEXT:    v_writelane_b32 v32, s67, 17
+; SI-NEXT:    v_writelane_b32 v32, s68, 18
+; SI-NEXT:    v_writelane_b32 v32, s69, 19
+; SI-NEXT:    v_writelane_b32 v32, s70, 20
+; SI-NEXT:    v_writelane_b32 v32, s71, 21
+; SI-NEXT:    v_writelane_b32 v32, s80, 22
+; SI-NEXT:    v_writelane_b32 v32, s81, 23
+; SI-NEXT:    v_writelane_b32 v32, s82, 24
+; SI-NEXT:    v_writelane_b32 v32, s83, 25
+; SI-NEXT:    v_writelane_b32 v32, s84, 26
+; SI-NEXT:    v_writelane_b32 v32, s85, 27
+; SI-NEXT:    v_writelane_b32 v32, s86, 28
+; SI-NEXT:    v_writelane_b32 v32, s87, 29
+; SI-NEXT:    v_writelane_b32 v32, s96, 30
+; SI-NEXT:    v_writelane_b32 v32, s97, 31
+; SI-NEXT:    v_writelane_b32 v32, s98, 32
+; SI-NEXT:    v_writelane_b32 v32, s99, 33
+; SI-NEXT:    v_writelane_b32 v32, s30, 34
+; SI-NEXT:    v_writelane_b32 v32, s31, 35
 ; SI-NEXT:    v_readfirstlane_b32 s6, v15
 ; SI-NEXT:    v_readfirstlane_b32 s8, v14
 ; SI-NEXT:    v_readfirstlane_b32 s10, v13
@@ -24256,7 +24257,6 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a,
 ; SI-NEXT:    v_readfirstlane_b32 s30, v2
 ; SI-NEXT:    v_readfirstlane_b32 s35, v1
 ; SI-NEXT:    v_readfirstlane_b32 s70, v0
-; SI-NEXT:    v_writelane_b32 v32, s99, 35
 ; SI-NEXT:    s_lshr_b32 s31, s29, 16
 ; SI-NEXT:    s_lshr_b32 s68, s28, 16
 ; SI-NEXT:    s_lshr_b32 s71, s27, 16
@@ -24661,42 +24661,42 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a,
 ; SI-NEXT:    v_mov_b32_e32 v30, s66
 ; SI-NEXT:    v_mov_b32_e32 v31, s67
 ; SI-NEXT:  .LBB35_5: ; %end
-; SI-NEXT:    v_readlane_b32 s99, v32, 35
-; SI-NEXT:    v_readlane_b32 s98, v32, 34
-; SI-NEXT:    v_readlane_b32 s97, v32, 33
-; SI-NEXT:    v_readlane_b32 s96, v32, 32
-; SI-NEXT:    v_readlane_b32 s87, v32, 31
-; SI-NEXT:    v_readlane_b32 s86, v32, 30
-; SI-NEXT:    v_readlane_b32 s85, v32, 29
-; SI-NEXT:    v_readlane_b32 s84, v32, 28
-; SI-NEXT:    v_readlane_b32 s83, v32, 27
-; SI-NEXT:    v_readlane_b32 s82, v32, 26
-; SI-NEXT:    v_readlane_b32 s81, v32, 25
-; SI-NEXT:    v_readlane_b32 s80, v32, 24
-; SI-NEXT:    v_readlane_b32 s71, v32, 23
-; SI-NEXT:    v_readlane_b32 s70, v32, 22
-; SI-NEXT:    v_readlane_b32 s69, v32, 21
-; SI-NEXT:    v_readlane_b32 s68, v32, 20
-; SI-NEXT:    v_readlane_b32 s67, v32, 19
-; SI-NEXT:    v_readlane_b32 s66, v32, 18
-; SI-NEXT:    v_readlane_b32 s65, v32, 17
-; SI-NEXT:    v_readlane_b32 s64, v32, 16
-; SI-NEXT:    v_readlane_b32 s55, v32, 15
-; SI-NEXT:    v_readlane_b32 s54, v32, 14
-; SI-NEXT:    v_readlane_b32 s53, v32, 13
-; SI-NEXT:    v_readlane_b32 s52, v32, 12
-; SI-NEXT:    v_readlane_b32 s51, v32, 11
-; SI-NEXT:    v_readlane_b32 s50, v32, 10
-; SI-NEXT:    v_readlane_b32 s49, v32, 9
-; SI-NEXT:    v_readlane_b32 s48, v32, 8
-; SI-NEXT:    v_readlane_b32 s39, v32, 7
-; SI-NEXT:    v_readlane_b32 s38, v32, 6
-; SI-NEXT:    v_readlane_b32 s37, v32, 5
-; SI-NEXT:    v_readlane_b32 s36, v32, 4
-; SI-NEXT:    v_readlane_b32 s35, v32, 3
-; SI-NEXT:    v_readlane_b32 s34, v32, 2
-; SI-NEXT:    v_readlane_b32 s31, v32, 1
-; SI-NEXT:    v_readlane_b32 s30, v32, 0
+; SI-NEXT:    v_readlane_b32 s30, v32, 34
+; SI-NEXT:    v_readlane_b32 s31, v32, 35
+; SI-NEXT:    v_readlane_b32 s99, v32, 33
+; SI-NEXT:    v_readlane_b32 s98, v32, 32
+; SI-NEXT:    v_readlane_b32 s97, v32, 31
+; SI-NEXT:    v_readlane_b32 s96, v32, 30
+; SI-NEXT:    v_readlane_b32 s87, v32, 29
+; SI-NEXT:    v_readlane_b32 s86, v32, 28
+; SI-NEXT:    v_readlane_b32 s85, v32, 27
+; SI-NEXT:    v_readlane_b32 s84, v32, 26
+; SI-NEXT:    v_readlane_b32 s83, v32, 25
+; SI-NEXT:    v_readlane_b32 s82, v32, 24
+; SI-NEXT:    v_readlane_b32 s81, v32, 23
+; SI-NEXT:    v_readlane_b32 s80, v32, 22
+; SI-NEXT:    v_readlane_b32 s71, v32, 21
+; SI-NEXT:    v_readlane_b32 s70, v32, 20
+; SI-NEXT:    v_readlane_b32 s69, v32, 19
+; SI-NEXT:    v_readlane_b32 s68, v32, 18
+; SI-NEXT:    v_readlane_b32 s67, v32, 17
+; SI-NEXT:    v_readlane_b32 s66, v32, 16
+; SI-NEXT:    v_readlane_b32 s65, v32, 15
+; SI-NEXT:    v_readlane_b32 s64, v32, 14
+; SI-NEXT:    v_readlane_b32 s55, v32, 13
+; SI-NEXT:    v_readlane_b32 s54, v32, 12
+; SI-NEXT:    v_readlane_b32 s53, v32, 11
+; SI-NEXT:    v_readlane_b32 s52, v32, 10
+; SI-NEXT:    v_readlane_b32 s51, v32, 9
+; SI-NEXT:    v_readlane_b32 s50, v32, 8
+; SI-NEXT:    v_readlane_b32 s49, v32, 7
+; SI-NEXT:    v_readlane_b32 s48, v32, 6
+; SI-NEXT:    v_readlane_b32 s39, v32, 5
+; SI-NEXT:    v_readlane_b32 s38, v32, 4
+; SI-NEXT:    v_readlane_b32 s37, v32, 3
+; SI-NEXT:    v_readlane_b32 s36, v32, 2
+; SI-NEXT:    v_readlane_b32 s35, v32, 1
+; SI-NEXT:    v_readlane_b32 s34, v32, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -24710,53 +24710,54 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a,
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v32, s30, 0
-; VI-NEXT:    v_writelane_b32 v32, s31, 1
-; VI-NEXT:    v_writelane_b32 v32, s34, 2
-; VI-NEXT:    v_writelane_b32 v32, s35, 3
-; VI-NEXT:    v_writelane_b32 v32, s36, 4
-; VI-NEXT:    v_writelane_b32 v32, s37, 5
-; VI-NEXT:    v_writelane_b32 v32, s38, 6
-; VI-NEXT:    v_writelane_b32 v32, s39, 7
-; VI-NEXT:    v_writelane_b32 v32, s48, 8
-; VI-NEXT:    v_writelane_b32 v32, s49, 9
-; VI-NEXT:    v_writelane_b32 v32, s50, 10
-; VI-NEXT:    v_writelane_b32 v32, s51, 11
-; VI-NEXT:    v_writelane_b32 v32, s52, 12
-; VI-NEXT:    v_writelane_b32 v32, s53, 13
-; VI-NEXT:    v_writelane_b32 v32, s54, 14
-; VI-NEXT:    v_writelane_b32 v32, s55, 15
-; VI-NEXT:    v_writelane_b32 v32, s64, 16
-; VI-NEXT:    v_writelane_b32 v32, s65, 17
-; VI-NEXT:    v_writelane_b32 v32, s66, 18
-; VI-NEXT:    v_writelane_b32 v32, s67, 19
-; VI-NEXT:    v_writelane_b32 v32, s68, 20
-; VI-NEXT:    v_writelane_b32 v32, s69, 21
-; VI-NEXT:    v_writelane_b32 v32, s70, 22
-; VI-NEXT:    v_writelane_b32 v32, s71, 23
-; VI-NEXT:    v_writelane_b32 v32, s80, 24
+; VI-NEXT:    v_writelane_b32 v32, s34, 0
+; VI-NEXT:    v_writelane_b32 v32, s35, 1
+; VI-NEXT:    v_writelane_b32 v32, s36, 2
+; VI-NEXT:    v_writelane_b32 v32, s37, 3
+; VI-NEXT:    v_writelane_b32 v32, s38, 4
+; VI-NEXT:    v_writelane_b32 v32, s39, 5
+; VI-NEXT:    v_writelane_b32 v32, s48, 6
+; VI-NEXT:    v_writelane_b32 v32, s49, 7
+; VI-NEXT:    v_writelane_b32 v32, s50, 8
+; VI-NEXT:    v_writelane_b32 v32, s51, 9
+; VI-NEXT:    v_writelane_b32 v32, s52, 10
+; VI-NEXT:    v_writelane_b32 v32, s53, 11
+; VI-NEXT:    v_writelane_b32 v32, s54, 12
+; VI-NEXT:    v_writelane_b32 v32, s55, 13
+; VI-NEXT:    v_writelane_b32 v32, s64, 14
+; VI-NEXT:    v_writelane_b32 v32, s65, 15
+; VI-NEXT:    v_writelane_b32 v32, s66, 16
+; VI-NEXT:    v_writelane_b32 v32, s67, 17
+; VI-NEXT:    v_writelane_b32 v32, s68, 18
+; VI-NEXT:    v_writelane_b32 v32, s69, 19
+; VI-NEXT:    v_writelane_b32 v32, s70, 20
+; VI-NEXT:    v_writelane_b32 v32, s71, 21
+; VI-NEXT:    v_writelane_b32 v32, s80, 22
+; VI-NEXT:    v_writelane_b32 v32, s81, 23
+; VI-NEXT:    v_writelane_b32 v32, s82, 24
+; VI-NEXT:    v_writelane_b32 v32, s83, 25
 ; VI-NEXT:    v_readfirstlane_b32 s6, v15
-; VI-NEXT:    v_writelane_b32 v32, s81, 25
+; VI-NEXT:    v_writelane_b32 v32, s84, 26
 ; VI-NEXT:    s_lshr_b32 vcc_lo, s6, 16
 ; VI-NEXT:    v_readfirstlane_b32 s8, v14
 ; VI-NEXT:    ; implicit-def: $vgpr33 : SGPR spill to VGPR lane
-; VI-NEXT:    v_writelane_b32 v32, s82, 26
+; VI-NEXT:    v_writelane_b32 v32, s85, 27
 ; VI-NEXT:    s_lshr_b32 vcc_hi, s8, 16
 ; VI-NEXT:    v_readfirstlane_b32 s10, v13
 ; VI-NEXT:    v_writelane_b32 v33, vcc_lo, 0
-; VI-NEXT:    v_writelane_b32 v32, s83, 27
+; VI-NEXT:    v_writelane_b32 v32, s86, 28
 ; VI-NEXT:    s_lshr_b32 s63, s10, 16
 ; VI-NEXT:    v_readfirstlane_b32 s12, v12
 ; VI-NEXT:    v_writelane_b32 v33, vcc_hi, 1
-; VI-NEXT:    v_writelane_b32 v32, s84, 28
+; VI-NEXT:    v_writelane_b32 v32, s87, 29
 ; VI-NEXT:    s_lshr_b32 s62, s12, 16
 ; VI-NEXT:    v_readfirstlane_b32 s14, v11
 ; VI-NEXT:    v_writelane_b32 v33, s63, 2
-; VI-NEXT:    v_writelane_b32 v32, s85, 29
+; VI-NEXT:    v_writelane_b32 v32, s30, 30
 ; VI-NEXT:    s_lshr_b32 s61, s14, 16
 ; VI-NEXT:    v_readfirstlane_b32 s72, v10
 ; VI-NEXT:    v_writelane_b32 v33, s62, 3
-; VI-NEXT:    v_writelane_b32 v32, s86, 30
+; VI-NEXT:    v_writelane_b32 v32, s31, 31
 ; VI-NEXT:    s_lshr_b32 s60, s72, 16
 ; VI-NEXT:    v_readfirstlane_b32 s74, v9
 ; VI-NEXT:    v_readfirstlane_b32 s76, v8
@@ -24769,7 +24770,6 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a,
 ; VI-NEXT:    v_readfirstlane_b32 s85, v1
 ; VI-NEXT:    v_readfirstlane_b32 s7, v0
 ; VI-NEXT:    v_writelane_b32 v33, s61, 4
-; VI-NEXT:    v_writelane_b32 v32, s87, 31
 ; VI-NEXT:    s_lshr_b32 s56, s29, 16
 ; VI-NEXT:    s_lshr_b32 s88, s28, 16
 ; VI-NEXT:    s_lshr_b32 s31, s27, 16
@@ -25068,38 +25068,38 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a,
 ; VI-NEXT:    v_mov_b32_e32 v30, s66
 ; VI-NEXT:    v_mov_b32_e32 v31, s67
 ; VI-NEXT:  .LBB35_5: ; %end
-; VI-NEXT:    v_readlane_b32 s87, v32, 31
-; VI-NEXT:    v_readlane_b32 s86, v32, 30
-; VI-NEXT:    v_readlane_b32 s85, v32, 29
-; VI-NEXT:    v_readlane_b32 s84, v32, 28
-; VI-NEXT:    v_readlane_b32 s83, v32, 27
-; VI-NEXT:    v_readlane_b32 s82, v32, 26
-; VI-NEXT:    v_readlane_b32 s81, v32, 25
-; VI-NEXT:    v_readlane_b32 s80, v32, 24
-; VI-NEXT:    v_readlane_b32 s71, v32, 23
-; VI-NEXT:    v_readlane_b32 s70, v32, 22
-; VI-NEXT:    v_readlane_b32 s69, v32, 21
-; VI-NEXT:    v_readlane_b32 s68, v32, 20
-; VI-NEXT:    v_readlane_b32 s67, v32, 19
-; VI-NEXT:    v_readlane_b32 s66, v32, 18
-; VI-NEXT:    v_readlane_b32 s65, v32, 17
-; VI-NEXT:    v_readlane_b32 s64, v32, 16
-; VI-NEXT:    v_readlane_b32 s55, v32, 15
-; VI-NEXT:    v_readlane_b32 s54, v32, 14
-; VI-NEXT:    v_readlane_b32 s53, v32, 13
-; VI-NEXT:    v_readlane_b32 s52, v32, 12
-; VI-NEXT:    v_readlane_b32 s51, v32, 11
-; VI-NEXT:    v_readlane_b32 s50, v32, 10
-; VI-NEXT:    v_readlane_b32 s49, v32, 9
-; VI-NEXT:    v_readlane_b32 s48, v32, 8
-; VI-NEXT:    v_readlane_b32 s39, v32, 7
-; VI-NEXT:    v_readlane_b32 s38, v32, 6
-; VI-NEXT:    v_readlane_b32 s37, v32, 5
-; VI-NEXT:    v_readlane_b32 s36, v32, 4
-; VI-NEXT:    v_readlane_b32 s35, v32, 3
-; VI-NEXT:    v_readlane_b32 s34, v32, 2
-; VI-NEXT:    v_readlane_b32 s31, v32, 1
-; VI-NEXT:    v_readlane_b32 s30, v32, 0
+; VI-NEXT:    v_readlane_b32 s30, v32, 30
+; VI-NEXT:    v_readlane_b32 s31, v32, 31
+; VI-NEXT:    v_readlane_b32 s87, v32, 29
+; VI-NEXT:    v_readlane_b32 s86, v32, 28
+; VI-NEXT:    v_readlane_b32 s85, v32, 27
+; VI-NEXT:    v_readlane_b32 s84, v32, 26
+; VI-NEXT:    v_readlane_b32 s83, v32, 25
+; VI-NEXT:    v_readlane_b32 s82, v32, 24
+; VI-NEXT:    v_readlane_b32 s81, v32, 23
+; VI-NEXT:    v_readlane_b32 s80, v32, 22
+; VI-NEXT:    v_readlane_b32 s71, v32, 21
+; VI-NEXT:    v_readlane_b32 s70, v32, 20
+; VI-NEXT:    v_readlane_b32 s69, v32, 19
+; VI-NEXT:    v_readlane_b32 s68, v32, 18
+; VI-NEXT:    v_readlane_b32 s67, v32, 17
+; VI-NEXT:    v_readlane_b32 s66, v32, 16
+; VI-NEXT:    v_readlane_b32 s65, v32, 15
+; VI-NEXT:    v_readlane_b32 s64, v32, 14
+; VI-NEXT:    v_readlane_b32 s55, v32, 13
+; VI-NEXT:    v_readlane_b32 s54, v32, 12
+; VI-NEXT:    v_readlane_b32 s53, v32, 11
+; VI-NEXT:    v_readlane_b32 s52, v32, 10
+; VI-NEXT:    v_readlane_b32 s51, v32, 9
+; VI-NEXT:    v_readlane_b32 s50, v32, 8
+; VI-NEXT:    v_readlane_b32 s49, v32, 7
+; VI-NEXT:    v_readlane_b32 s48, v32, 6
+; VI-NEXT:    v_readlane_b32 s39, v32, 5
+; VI-NEXT:    v_readlane_b32 s38, v32, 4
+; VI-NEXT:    v_readlane_b32 s37, v32, 3
+; VI-NEXT:    v_readlane_b32 s36, v32, 2
+; VI-NEXT:    v_readlane_b32 s35, v32, 1
+; VI-NEXT:    v_readlane_b32 s34, v32, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -25448,7 +25448,7 @@ end:
   ret <30 x float> %phi
 }
 
-define <15 x double> @bitcast_v15i64_to_v15f64(<15 x i64> %a, i32 %b) {
+define <15 x double> @bitcast_v15i64_to_v15f64(<15 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v15i64_to_v15f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -25646,7 +25646,7 @@ end:
   ret <15 x double> %phi
 }
 
-define inreg <15 x double> @bitcast_v15i64_to_v15f64_scalar(<15 x i64> inreg %a, i32 inreg %b) {
+define inreg <15 x double> @bitcast_v15i64_to_v15f64_scalar(<15 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v15i64_to_v15f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -26006,7 +26006,7 @@ end:
   ret <15 x double> %phi
 }
 
-define <15 x i64> @bitcast_v15f64_to_v15i64(<15 x double> %a, i32 %b) {
+define <15 x i64> @bitcast_v15f64_to_v15i64(<15 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v15f64_to_v15i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -26136,7 +26136,7 @@ end:
   ret <15 x i64> %phi
 }
 
-define inreg <15 x i64> @bitcast_v15f64_to_v15i64_scalar(<15 x double> inreg %a, i32 inreg %b) {
+define inreg <15 x i64> @bitcast_v15f64_to_v15i64_scalar(<15 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v15f64_to_v15i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -26652,7 +26652,7 @@ end:
   ret <15 x i64> %phi
 }
 
-define <60 x i16> @bitcast_v15i64_to_v60i16(<15 x i64> %a, i32 %b) {
+define <60 x i16> @bitcast_v15i64_to_v60i16(<15 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v15i64_to_v60i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -27576,7 +27576,7 @@ end:
   ret <60 x i16> %phi
 }
 
-define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i32 inreg %b) {
+define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v15i64_to_v60i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -27584,23 +27584,23 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3
 ; SI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v30, s30, 0
-; SI-NEXT:    v_writelane_b32 v30, s31, 1
-; SI-NEXT:    v_writelane_b32 v30, s34, 2
-; SI-NEXT:    v_writelane_b32 v30, s35, 3
-; SI-NEXT:    v_writelane_b32 v30, s36, 4
-; SI-NEXT:    v_writelane_b32 v30, s37, 5
-; SI-NEXT:    v_writelane_b32 v30, s38, 6
-; SI-NEXT:    v_writelane_b32 v30, s39, 7
-; SI-NEXT:    v_writelane_b32 v30, s48, 8
-; SI-NEXT:    v_writelane_b32 v30, s49, 9
-; SI-NEXT:    v_writelane_b32 v30, s50, 10
-; SI-NEXT:    v_writelane_b32 v30, s51, 11
-; SI-NEXT:    v_writelane_b32 v30, s52, 12
-; SI-NEXT:    v_writelane_b32 v30, s53, 13
-; SI-NEXT:    v_writelane_b32 v30, s54, 14
+; SI-NEXT:    v_writelane_b32 v30, s34, 0
+; SI-NEXT:    v_writelane_b32 v30, s35, 1
+; SI-NEXT:    v_writelane_b32 v30, s36, 2
+; SI-NEXT:    v_writelane_b32 v30, s37, 3
+; SI-NEXT:    v_writelane_b32 v30, s38, 4
+; SI-NEXT:    v_writelane_b32 v30, s39, 5
+; SI-NEXT:    v_writelane_b32 v30, s48, 6
+; SI-NEXT:    v_writelane_b32 v30, s49, 7
+; SI-NEXT:    v_writelane_b32 v30, s50, 8
+; SI-NEXT:    v_writelane_b32 v30, s51, 9
+; SI-NEXT:    v_writelane_b32 v30, s52, 10
+; SI-NEXT:    v_writelane_b32 v30, s53, 11
+; SI-NEXT:    v_writelane_b32 v30, s54, 12
+; SI-NEXT:    v_writelane_b32 v30, s55, 13
+; SI-NEXT:    v_writelane_b32 v30, s64, 14
 ; SI-NEXT:    v_readfirstlane_b32 s42, v16
-; SI-NEXT:    v_writelane_b32 v30, s55, 15
+; SI-NEXT:    v_writelane_b32 v30, s30, 15
 ; SI-NEXT:    v_readfirstlane_b32 s5, v15
 ; SI-NEXT:    v_readfirstlane_b32 s4, v14
 ; SI-NEXT:    v_readfirstlane_b32 s7, v13
@@ -27618,7 +27618,7 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3
 ; SI-NEXT:    v_readfirstlane_b32 s43, v1
 ; SI-NEXT:    s_cmp_lg_u32 s42, 0
 ; SI-NEXT:    v_readfirstlane_b32 s42, v0
-; SI-NEXT:    v_writelane_b32 v30, s64, 16
+; SI-NEXT:    v_writelane_b32 v30, s31, 16
 ; SI-NEXT:    s_cbranch_scc0 .LBB41_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s34, s5, 16
@@ -27804,6 +27804,7 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3
 ; SI-NEXT:    s_lshl_b32 s44, s34, 16
 ; SI-NEXT:    s_or_b32 s7, s7, s45
 ; SI-NEXT:    s_or_b32 s5, s5, s44
+; SI-NEXT:    v_readlane_b32 s30, v30, 15
 ; SI-NEXT:    v_mov_b32_e32 v0, s16
 ; SI-NEXT:    v_mov_b32_e32 v1, s17
 ; SI-NEXT:    v_mov_b32_e32 v2, s18
@@ -27834,23 +27835,22 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3
 ; SI-NEXT:    v_mov_b32_e32 v27, s7
 ; SI-NEXT:    v_mov_b32_e32 v28, s4
 ; SI-NEXT:    v_mov_b32_e32 v29, s5
-; SI-NEXT:    v_readlane_b32 s64, v30, 16
-; SI-NEXT:    v_readlane_b32 s55, v30, 15
-; SI-NEXT:    v_readlane_b32 s54, v30, 14
-; SI-NEXT:    v_readlane_b32 s53, v30, 13
-; SI-NEXT:    v_readlane_b32 s52, v30, 12
-; SI-NEXT:    v_readlane_b32 s51, v30, 11
-; SI-NEXT:    v_readlane_b32 s50, v30, 10
-; SI-NEXT:    v_readlane_b32 s49, v30, 9
-; SI-NEXT:    v_readlane_b32 s48, v30, 8
-; SI-NEXT:    v_readlane_b32 s39, v30, 7
-; SI-NEXT:    v_readlane_b32 s38, v30, 6
-; SI-NEXT:    v_readlane_b32 s37, v30, 5
-; SI-NEXT:    v_readlane_b32 s36, v30, 4
-; SI-NEXT:    v_readlane_b32 s35, v30, 3
-; SI-NEXT:    v_readlane_b32 s34, v30, 2
-; SI-NEXT:    v_readlane_b32 s31, v30, 1
-; SI-NEXT:    v_readlane_b32 s30, v30, 0
+; SI-NEXT:    v_readlane_b32 s31, v30, 16
+; SI-NEXT:    v_readlane_b32 s64, v30, 14
+; SI-NEXT:    v_readlane_b32 s55, v30, 13
+; SI-NEXT:    v_readlane_b32 s54, v30, 12
+; SI-NEXT:    v_readlane_b32 s53, v30, 11
+; SI-NEXT:    v_readlane_b32 s52, v30, 10
+; SI-NEXT:    v_readlane_b32 s51, v30, 9
+; SI-NEXT:    v_readlane_b32 s50, v30, 8
+; SI-NEXT:    v_readlane_b32 s49, v30, 7
+; SI-NEXT:    v_readlane_b32 s48, v30, 6
+; SI-NEXT:    v_readlane_b32 s39, v30, 5
+; SI-NEXT:    v_readlane_b32 s38, v30, 4
+; SI-NEXT:    v_readlane_b32 s37, v30, 3
+; SI-NEXT:    v_readlane_b32 s36, v30, 2
+; SI-NEXT:    v_readlane_b32 s35, v30, 1
+; SI-NEXT:    v_readlane_b32 s34, v30, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -27895,14 +27895,14 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v30, s30, 0
-; VI-NEXT:    v_writelane_b32 v30, s31, 1
-; VI-NEXT:    v_writelane_b32 v30, s34, 2
-; VI-NEXT:    v_writelane_b32 v30, s35, 3
-; VI-NEXT:    v_writelane_b32 v30, s36, 4
-; VI-NEXT:    v_writelane_b32 v30, s37, 5
+; VI-NEXT:    v_writelane_b32 v30, s34, 0
+; VI-NEXT:    v_writelane_b32 v30, s35, 1
+; VI-NEXT:    v_writelane_b32 v30, s36, 2
+; VI-NEXT:    v_writelane_b32 v30, s37, 3
+; VI-NEXT:    v_writelane_b32 v30, s38, 4
+; VI-NEXT:    v_writelane_b32 v30, s39, 5
 ; VI-NEXT:    v_readfirstlane_b32 s4, v16
-; VI-NEXT:    v_writelane_b32 v30, s38, 6
+; VI-NEXT:    v_writelane_b32 v30, s30, 6
 ; VI-NEXT:    v_readfirstlane_b32 s6, v15
 ; VI-NEXT:    v_readfirstlane_b32 s7, v14
 ; VI-NEXT:    v_readfirstlane_b32 s8, v13
@@ -27920,7 +27920,7 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3
 ; VI-NEXT:    v_readfirstlane_b32 s44, v1
 ; VI-NEXT:    s_cmp_lg_u32 s4, 0
 ; VI-NEXT:    v_readfirstlane_b32 s45, v0
-; VI-NEXT:    v_writelane_b32 v30, s39, 7
+; VI-NEXT:    v_writelane_b32 v30, s31, 7
 ; VI-NEXT:    s_cbranch_scc0 .LBB41_4
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    s_lshr_b32 s46, s6, 16
@@ -28106,6 +28106,7 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3
 ; VI-NEXT:    s_and_b32 s6, 0xffff, s6
 ; VI-NEXT:    s_lshl_b32 s44, s46, 16
 ; VI-NEXT:    s_or_b32 s6, s6, s44
+; VI-NEXT:    v_readlane_b32 s30, v30, 6
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_mov_b32_e32 v2, s16
@@ -28136,14 +28137,13 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3
 ; VI-NEXT:    v_mov_b32_e32 v27, s8
 ; VI-NEXT:    v_mov_b32_e32 v28, s7
 ; VI-NEXT:    v_mov_b32_e32 v29, s6
-; VI-NEXT:    v_readlane_b32 s39, v30, 7
-; VI-NEXT:    v_readlane_b32 s38, v30, 6
-; VI-NEXT:    v_readlane_b32 s37, v30, 5
-; VI-NEXT:    v_readlane_b32 s36, v30, 4
-; VI-NEXT:    v_readlane_b32 s35, v30, 3
-; VI-NEXT:    v_readlane_b32 s34, v30, 2
-; VI-NEXT:    v_readlane_b32 s31, v30, 1
-; VI-NEXT:    v_readlane_b32 s30, v30, 0
+; VI-NEXT:    v_readlane_b32 s31, v30, 7
+; VI-NEXT:    v_readlane_b32 s39, v30, 5
+; VI-NEXT:    v_readlane_b32 s38, v30, 4
+; VI-NEXT:    v_readlane_b32 s37, v30, 3
+; VI-NEXT:    v_readlane_b32 s36, v30, 2
+; VI-NEXT:    v_readlane_b32 s35, v30, 1
+; VI-NEXT:    v_readlane_b32 s34, v30, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -28188,10 +28188,10 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v30, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v30, s31, 1
+; GFX9-NEXT:    v_writelane_b32 v30, s34, 0
+; GFX9-NEXT:    v_writelane_b32 v30, s35, 1
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v16
-; GFX9-NEXT:    v_writelane_b32 v30, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v30, s30, 2
 ; GFX9-NEXT:    v_readfirstlane_b32 s6, v15
 ; GFX9-NEXT:    v_readfirstlane_b32 s7, v14
 ; GFX9-NEXT:    v_readfirstlane_b32 s8, v13
@@ -28209,7 +28209,7 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3
 ; GFX9-NEXT:    v_readfirstlane_b32 s44, v1
 ; GFX9-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX9-NEXT:    v_readfirstlane_b32 s45, v0
-; GFX9-NEXT:    v_writelane_b32 v30, s35, 3
+; GFX9-NEXT:    v_writelane_b32 v30, s31, 3
 ; GFX9-NEXT:    s_cbranch_scc0 .LBB41_4
 ; GFX9-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX9-NEXT:    s_lshr_b32 s46, s6, 16
@@ -28335,6 +28335,7 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s8, s8, s56
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s7, s7, s47
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s6, s6, s46
+; GFX9-NEXT:    v_readlane_b32 s30, v30, 2
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s16
@@ -28365,10 +28366,9 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3
 ; GFX9-NEXT:    v_mov_b32_e32 v27, s8
 ; GFX9-NEXT:    v_mov_b32_e32 v28, s7
 ; GFX9-NEXT:    v_mov_b32_e32 v29, s6
-; GFX9-NEXT:    v_readlane_b32 s35, v30, 3
-; GFX9-NEXT:    v_readlane_b32 s34, v30, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v30, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v30, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v30, 3
+; GFX9-NEXT:    v_readlane_b32 s35, v30, 1
+; GFX9-NEXT:    v_readlane_b32 s34, v30, 0
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -28617,7 +28617,7 @@ end:
   ret <60 x i16> %phi
 }
 
-define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) {
+define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v60i16_to_v15i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -29962,7 +29962,7 @@ end:
   ret <15 x i64> %phi
 }
 
-define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i32 inreg %b) {
+define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v60i16_to_v15i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -29970,41 +29970,42 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3
 ; SI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v30, s30, 0
-; SI-NEXT:    v_writelane_b32 v30, s31, 1
-; SI-NEXT:    v_writelane_b32 v30, s34, 2
-; SI-NEXT:    v_writelane_b32 v30, s35, 3
-; SI-NEXT:    v_writelane_b32 v30, s36, 4
-; SI-NEXT:    v_writelane_b32 v30, s37, 5
-; SI-NEXT:    v_writelane_b32 v30, s38, 6
-; SI-NEXT:    v_writelane_b32 v30, s39, 7
-; SI-NEXT:    v_writelane_b32 v30, s48, 8
-; SI-NEXT:    v_writelane_b32 v30, s49, 9
-; SI-NEXT:    v_writelane_b32 v30, s50, 10
-; SI-NEXT:    v_writelane_b32 v30, s51, 11
-; SI-NEXT:    v_writelane_b32 v30, s52, 12
-; SI-NEXT:    v_writelane_b32 v30, s53, 13
-; SI-NEXT:    v_writelane_b32 v30, s54, 14
-; SI-NEXT:    v_writelane_b32 v30, s55, 15
-; SI-NEXT:    v_writelane_b32 v30, s64, 16
-; SI-NEXT:    v_writelane_b32 v30, s65, 17
-; SI-NEXT:    v_writelane_b32 v30, s66, 18
-; SI-NEXT:    v_writelane_b32 v30, s67, 19
-; SI-NEXT:    v_writelane_b32 v30, s68, 20
-; SI-NEXT:    v_writelane_b32 v30, s69, 21
-; SI-NEXT:    v_writelane_b32 v30, s70, 22
-; SI-NEXT:    v_writelane_b32 v30, s71, 23
-; SI-NEXT:    v_writelane_b32 v30, s80, 24
-; SI-NEXT:    v_writelane_b32 v30, s81, 25
-; SI-NEXT:    v_writelane_b32 v30, s82, 26
-; SI-NEXT:    v_writelane_b32 v30, s83, 27
-; SI-NEXT:    v_writelane_b32 v30, s84, 28
-; SI-NEXT:    v_writelane_b32 v30, s85, 29
-; SI-NEXT:    v_writelane_b32 v30, s86, 30
-; SI-NEXT:    v_writelane_b32 v30, s87, 31
-; SI-NEXT:    v_writelane_b32 v30, s96, 32
-; SI-NEXT:    v_writelane_b32 v30, s97, 33
-; SI-NEXT:    v_writelane_b32 v30, s98, 34
+; SI-NEXT:    v_writelane_b32 v30, s34, 0
+; SI-NEXT:    v_writelane_b32 v30, s35, 1
+; SI-NEXT:    v_writelane_b32 v30, s36, 2
+; SI-NEXT:    v_writelane_b32 v30, s37, 3
+; SI-NEXT:    v_writelane_b32 v30, s38, 4
+; SI-NEXT:    v_writelane_b32 v30, s39, 5
+; SI-NEXT:    v_writelane_b32 v30, s48, 6
+; SI-NEXT:    v_writelane_b32 v30, s49, 7
+; SI-NEXT:    v_writelane_b32 v30, s50, 8
+; SI-NEXT:    v_writelane_b32 v30, s51, 9
+; SI-NEXT:    v_writelane_b32 v30, s52, 10
+; SI-NEXT:    v_writelane_b32 v30, s53, 11
+; SI-NEXT:    v_writelane_b32 v30, s54, 12
+; SI-NEXT:    v_writelane_b32 v30, s55, 13
+; SI-NEXT:    v_writelane_b32 v30, s64, 14
+; SI-NEXT:    v_writelane_b32 v30, s65, 15
+; SI-NEXT:    v_writelane_b32 v30, s66, 16
+; SI-NEXT:    v_writelane_b32 v30, s67, 17
+; SI-NEXT:    v_writelane_b32 v30, s68, 18
+; SI-NEXT:    v_writelane_b32 v30, s69, 19
+; SI-NEXT:    v_writelane_b32 v30, s70, 20
+; SI-NEXT:    v_writelane_b32 v30, s71, 21
+; SI-NEXT:    v_writelane_b32 v30, s80, 22
+; SI-NEXT:    v_writelane_b32 v30, s81, 23
+; SI-NEXT:    v_writelane_b32 v30, s82, 24
+; SI-NEXT:    v_writelane_b32 v30, s83, 25
+; SI-NEXT:    v_writelane_b32 v30, s84, 26
+; SI-NEXT:    v_writelane_b32 v30, s85, 27
+; SI-NEXT:    v_writelane_b32 v30, s86, 28
+; SI-NEXT:    v_writelane_b32 v30, s87, 29
+; SI-NEXT:    v_writelane_b32 v30, s96, 30
+; SI-NEXT:    v_writelane_b32 v30, s97, 31
+; SI-NEXT:    v_writelane_b32 v30, s98, 32
+; SI-NEXT:    v_writelane_b32 v30, s99, 33
+; SI-NEXT:    v_writelane_b32 v30, s30, 34
+; SI-NEXT:    v_writelane_b32 v30, s31, 35
 ; SI-NEXT:    v_readfirstlane_b32 s7, v15
 ; SI-NEXT:    v_readfirstlane_b32 s9, v14
 ; SI-NEXT:    v_readfirstlane_b32 s11, v13
@@ -30021,7 +30022,6 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3
 ; SI-NEXT:    v_readfirstlane_b32 s70, v2
 ; SI-NEXT:    v_readfirstlane_b32 s81, v1
 ; SI-NEXT:    v_readfirstlane_b32 s84, v0
-; SI-NEXT:    v_writelane_b32 v30, s99, 35
 ; SI-NEXT:    s_lshr_b32 s90, s29, 16
 ; SI-NEXT:    s_lshr_b32 s92, s28, 16
 ; SI-NEXT:    s_lshr_b32 s94, s27, 16
@@ -30299,6 +30299,7 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_add_i32 s65, s4, 0x30000
 ; SI-NEXT:  .LBB43_3: ; %end
+; SI-NEXT:    v_readlane_b32 s30, v30, 34
 ; SI-NEXT:    v_mov_b32_e32 v0, s36
 ; SI-NEXT:    v_mov_b32_e32 v1, s37
 ; SI-NEXT:    v_mov_b32_e32 v2, s38
@@ -30329,42 +30330,41 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3
 ; SI-NEXT:    v_mov_b32_e32 v27, s63
 ; SI-NEXT:    v_mov_b32_e32 v28, s64
 ; SI-NEXT:    v_mov_b32_e32 v29, s65
-; SI-NEXT:    v_readlane_b32 s99, v30, 35
-; SI-NEXT:    v_readlane_b32 s98, v30, 34
-; SI-NEXT:    v_readlane_b32 s97, v30, 33
-; SI-NEXT:    v_readlane_b32 s96, v30, 32
-; SI-NEXT:    v_readlane_b32 s87, v30, 31
-; SI-NEXT:    v_readlane_b32 s86, v30, 30
-; SI-NEXT:    v_readlane_b32 s85, v30, 29
-; SI-NEXT:    v_readlane_b32 s84, v30, 28
-; SI-NEXT:    v_readlane_b32 s83, v30, 27
-; SI-NEXT:    v_readlane_b32 s82, v30, 26
-; SI-NEXT:    v_readlane_b32 s81, v30, 25
-; SI-NEXT:    v_readlane_b32 s80, v30, 24
-; SI-NEXT:    v_readlane_b32 s71, v30, 23
-; SI-NEXT:    v_readlane_b32 s70, v30, 22
-; SI-NEXT:    v_readlane_b32 s69, v30, 21
-; SI-NEXT:    v_readlane_b32 s68, v30, 20
-; SI-NEXT:    v_readlane_b32 s67, v30, 19
-; SI-NEXT:    v_readlane_b32 s66, v30, 18
-; SI-NEXT:    v_readlane_b32 s65, v30, 17
-; SI-NEXT:    v_readlane_b32 s64, v30, 16
-; SI-NEXT:    v_readlane_b32 s55, v30, 15
-; SI-NEXT:    v_readlane_b32 s54, v30, 14
-; SI-NEXT:    v_readlane_b32 s53, v30, 13
-; SI-NEXT:    v_readlane_b32 s52, v30, 12
-; SI-NEXT:    v_readlane_b32 s51, v30, 11
-; SI-NEXT:    v_readlane_b32 s50, v30, 10
-; SI-NEXT:    v_readlane_b32 s49, v30, 9
-; SI-NEXT:    v_readlane_b32 s48, v30, 8
-; SI-NEXT:    v_readlane_b32 s39, v30, 7
-; SI-NEXT:    v_readlane_b32 s38, v30, 6
-; SI-NEXT:    v_readlane_b32 s37, v30, 5
-; SI-NEXT:    v_readlane_b32 s36, v30, 4
-; SI-NEXT:    v_readlane_b32 s35, v30, 3
-; SI-NEXT:    v_readlane_b32 s34, v30, 2
-; SI-NEXT:    v_readlane_b32 s31, v30, 1
-; SI-NEXT:    v_readlane_b32 s30, v30, 0
+; SI-NEXT:    v_readlane_b32 s31, v30, 35
+; SI-NEXT:    v_readlane_b32 s99, v30, 33
+; SI-NEXT:    v_readlane_b32 s98, v30, 32
+; SI-NEXT:    v_readlane_b32 s97, v30, 31
+; SI-NEXT:    v_readlane_b32 s96, v30, 30
+; SI-NEXT:    v_readlane_b32 s87, v30, 29
+; SI-NEXT:    v_readlane_b32 s86, v30, 28
+; SI-NEXT:    v_readlane_b32 s85, v30, 27
+; SI-NEXT:    v_readlane_b32 s84, v30, 26
+; SI-NEXT:    v_readlane_b32 s83, v30, 25
+; SI-NEXT:    v_readlane_b32 s82, v30, 24
+; SI-NEXT:    v_readlane_b32 s81, v30, 23
+; SI-NEXT:    v_readlane_b32 s80, v30, 22
+; SI-NEXT:    v_readlane_b32 s71, v30, 21
+; SI-NEXT:    v_readlane_b32 s70, v30, 20
+; SI-NEXT:    v_readlane_b32 s69, v30, 19
+; SI-NEXT:    v_readlane_b32 s68, v30, 18
+; SI-NEXT:    v_readlane_b32 s67, v30, 17
+; SI-NEXT:    v_readlane_b32 s66, v30, 16
+; SI-NEXT:    v_readlane_b32 s65, v30, 15
+; SI-NEXT:    v_readlane_b32 s64, v30, 14
+; SI-NEXT:    v_readlane_b32 s55, v30, 13
+; SI-NEXT:    v_readlane_b32 s54, v30, 12
+; SI-NEXT:    v_readlane_b32 s53, v30, 11
+; SI-NEXT:    v_readlane_b32 s52, v30, 10
+; SI-NEXT:    v_readlane_b32 s51, v30, 9
+; SI-NEXT:    v_readlane_b32 s50, v30, 8
+; SI-NEXT:    v_readlane_b32 s49, v30, 7
+; SI-NEXT:    v_readlane_b32 s48, v30, 6
+; SI-NEXT:    v_readlane_b32 s39, v30, 5
+; SI-NEXT:    v_readlane_b32 s38, v30, 4
+; SI-NEXT:    v_readlane_b32 s37, v30, 3
+; SI-NEXT:    v_readlane_b32 s36, v30, 2
+; SI-NEXT:    v_readlane_b32 s35, v30, 1
+; SI-NEXT:    v_readlane_b32 s34, v30, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -30381,47 +30381,48 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3
 ; VI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v30, s30, 0
-; VI-NEXT:    v_writelane_b32 v30, s31, 1
-; VI-NEXT:    v_writelane_b32 v30, s34, 2
-; VI-NEXT:    v_writelane_b32 v30, s35, 3
-; VI-NEXT:    v_writelane_b32 v30, s36, 4
-; VI-NEXT:    v_writelane_b32 v30, s37, 5
-; VI-NEXT:    v_writelane_b32 v30, s38, 6
-; VI-NEXT:    v_writelane_b32 v30, s39, 7
-; VI-NEXT:    v_writelane_b32 v30, s48, 8
-; VI-NEXT:    v_writelane_b32 v30, s49, 9
-; VI-NEXT:    v_writelane_b32 v30, s50, 10
-; VI-NEXT:    v_writelane_b32 v30, s51, 11
-; VI-NEXT:    v_writelane_b32 v30, s52, 12
-; VI-NEXT:    v_writelane_b32 v30, s53, 13
-; VI-NEXT:    v_writelane_b32 v30, s54, 14
-; VI-NEXT:    v_writelane_b32 v30, s55, 15
-; VI-NEXT:    v_writelane_b32 v30, s64, 16
-; VI-NEXT:    v_writelane_b32 v30, s65, 17
-; VI-NEXT:    v_writelane_b32 v30, s66, 18
-; VI-NEXT:    v_writelane_b32 v30, s67, 19
-; VI-NEXT:    v_writelane_b32 v30, s68, 20
-; VI-NEXT:    v_writelane_b32 v30, s69, 21
-; VI-NEXT:    v_writelane_b32 v30, s70, 22
-; VI-NEXT:    v_writelane_b32 v30, s71, 23
-; VI-NEXT:    v_writelane_b32 v30, s80, 24
-; VI-NEXT:    v_writelane_b32 v30, s81, 25
-; VI-NEXT:    v_writelane_b32 v30, s82, 26
+; VI-NEXT:    v_writelane_b32 v30, s34, 0
+; VI-NEXT:    v_writelane_b32 v30, s35, 1
+; VI-NEXT:    v_writelane_b32 v30, s36, 2
+; VI-NEXT:    v_writelane_b32 v30, s37, 3
+; VI-NEXT:    v_writelane_b32 v30, s38, 4
+; VI-NEXT:    v_writelane_b32 v30, s39, 5
+; VI-NEXT:    v_writelane_b32 v30, s48, 6
+; VI-NEXT:    v_writelane_b32 v30, s49, 7
+; VI-NEXT:    v_writelane_b32 v30, s50, 8
+; VI-NEXT:    v_writelane_b32 v30, s51, 9
+; VI-NEXT:    v_writelane_b32 v30, s52, 10
+; VI-NEXT:    v_writelane_b32 v30, s53, 11
+; VI-NEXT:    v_writelane_b32 v30, s54, 12
+; VI-NEXT:    v_writelane_b32 v30, s55, 13
+; VI-NEXT:    v_writelane_b32 v30, s64, 14
+; VI-NEXT:    v_writelane_b32 v30, s65, 15
+; VI-NEXT:    v_writelane_b32 v30, s66, 16
+; VI-NEXT:    v_writelane_b32 v30, s67, 17
+; VI-NEXT:    v_writelane_b32 v30, s68, 18
+; VI-NEXT:    v_writelane_b32 v30, s69, 19
+; VI-NEXT:    v_writelane_b32 v30, s70, 20
+; VI-NEXT:    v_writelane_b32 v30, s71, 21
+; VI-NEXT:    v_writelane_b32 v30, s80, 22
+; VI-NEXT:    v_writelane_b32 v30, s81, 23
+; VI-NEXT:    v_writelane_b32 v30, s82, 24
+; VI-NEXT:    v_writelane_b32 v30, s83, 25
+; VI-NEXT:    v_writelane_b32 v30, s84, 26
+; VI-NEXT:    v_writelane_b32 v30, s85, 27
 ; VI-NEXT:    v_readfirstlane_b32 s11, v13
-; VI-NEXT:    v_writelane_b32 v30, s83, 27
+; VI-NEXT:    v_writelane_b32 v30, s86, 28
 ; VI-NEXT:    s_lshr_b32 s63, s11, 16
 ; VI-NEXT:    v_readfirstlane_b32 s13, v12
 ; VI-NEXT:    ; implicit-def: $vgpr31 : SGPR spill to VGPR lane
-; VI-NEXT:    v_writelane_b32 v30, s84, 28
+; VI-NEXT:    v_writelane_b32 v30, s87, 29
 ; VI-NEXT:    s_lshr_b32 s62, s13, 16
 ; VI-NEXT:    v_readfirstlane_b32 s15, v11
 ; VI-NEXT:    v_writelane_b32 v31, s63, 0
-; VI-NEXT:    v_writelane_b32 v30, s85, 29
+; VI-NEXT:    v_writelane_b32 v30, s30, 30
 ; VI-NEXT:    s_lshr_b32 s61, s15, 16
 ; VI-NEXT:    v_readfirstlane_b32 s73, v10
 ; VI-NEXT:    v_writelane_b32 v31, s62, 1
-; VI-NEXT:    v_writelane_b32 v30, s86, 30
+; VI-NEXT:    v_writelane_b32 v30, s31, 31
 ; VI-NEXT:    v_readfirstlane_b32 s7, v15
 ; VI-NEXT:    v_readfirstlane_b32 s9, v14
 ; VI-NEXT:    s_lshr_b32 s60, s73, 16
@@ -30436,7 +30437,6 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3
 ; VI-NEXT:    v_readfirstlane_b32 s85, v1
 ; VI-NEXT:    v_readfirstlane_b32 s6, v0
 ; VI-NEXT:    v_writelane_b32 v31, s61, 2
-; VI-NEXT:    v_writelane_b32 v30, s87, 31
 ; VI-NEXT:    s_lshr_b32 s90, s29, 16
 ; VI-NEXT:    s_lshr_b32 s30, s28, 16
 ; VI-NEXT:    s_lshr_b32 s34, s27, 16
@@ -30719,6 +30719,7 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3
 ; VI-NEXT:    s_or_b32 s4, s5, s4
 ; VI-NEXT:    s_add_i32 s65, s4, 0x30000
 ; VI-NEXT:  .LBB43_3: ; %end
+; VI-NEXT:    v_readlane_b32 s30, v30, 30
 ; VI-NEXT:    v_mov_b32_e32 v0, s36
 ; VI-NEXT:    v_mov_b32_e32 v1, s37
 ; VI-NEXT:    v_mov_b32_e32 v2, s38
@@ -30749,38 +30750,37 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3
 ; VI-NEXT:    v_mov_b32_e32 v27, s63
 ; VI-NEXT:    v_mov_b32_e32 v28, s64
 ; VI-NEXT:    v_mov_b32_e32 v29, s65
-; VI-NEXT:    v_readlane_b32 s87, v30, 31
-; VI-NEXT:    v_readlane_b32 s86, v30, 30
-; VI-NEXT:    v_readlane_b32 s85, v30, 29
-; VI-NEXT:    v_readlane_b32 s84, v30, 28
-; VI-NEXT:    v_readlane_b32 s83, v30, 27
-; VI-NEXT:    v_readlane_b32 s82, v30, 26
-; VI-NEXT:    v_readlane_b32 s81, v30, 25
-; VI-NEXT:    v_readlane_b32 s80, v30, 24
-; VI-NEXT:    v_readlane_b32 s71, v30, 23
-; VI-NEXT:    v_readlane_b32 s70, v30, 22
-; VI-NEXT:    v_readlane_b32 s69, v30, 21
-; VI-NEXT:    v_readlane_b32 s68, v30, 20
-; VI-NEXT:    v_readlane_b32 s67, v30, 19
-; VI-NEXT:    v_readlane_b32 s66, v30, 18
-; VI-NEXT:    v_readlane_b32 s65, v30, 17
-; VI-NEXT:    v_readlane_b32 s64, v30, 16
-; VI-NEXT:    v_readlane_b32 s55, v30, 15
-; VI-NEXT:    v_readlane_b32 s54, v30, 14
-; VI-NEXT:    v_readlane_b32 s53, v30, 13
-; VI-NEXT:    v_readlane_b32 s52, v30, 12
-; VI-NEXT:    v_readlane_b32 s51, v30, 11
-; VI-NEXT:    v_readlane_b32 s50, v30, 10
-; VI-NEXT:    v_readlane_b32 s49, v30, 9
-; VI-NEXT:    v_readlane_b32 s48, v30, 8
-; VI-NEXT:    v_readlane_b32 s39, v30, 7
-; VI-NEXT:    v_readlane_b32 s38, v30, 6
-; VI-NEXT:    v_readlane_b32 s37, v30, 5
-; VI-NEXT:    v_readlane_b32 s36, v30, 4
-; VI-NEXT:    v_readlane_b32 s35, v30, 3
-; VI-NEXT:    v_readlane_b32 s34, v30, 2
-; VI-NEXT:    v_readlane_b32 s31, v30, 1
-; VI-NEXT:    v_readlane_b32 s30, v30, 0
+; VI-NEXT:    v_readlane_b32 s31, v30, 31
+; VI-NEXT:    v_readlane_b32 s87, v30, 29
+; VI-NEXT:    v_readlane_b32 s86, v30, 28
+; VI-NEXT:    v_readlane_b32 s85, v30, 27
+; VI-NEXT:    v_readlane_b32 s84, v30, 26
+; VI-NEXT:    v_readlane_b32 s83, v30, 25
+; VI-NEXT:    v_readlane_b32 s82, v30, 24
+; VI-NEXT:    v_readlane_b32 s81, v30, 23
+; VI-NEXT:    v_readlane_b32 s80, v30, 22
+; VI-NEXT:    v_readlane_b32 s71, v30, 21
+; VI-NEXT:    v_readlane_b32 s70, v30, 20
+; VI-NEXT:    v_readlane_b32 s69, v30, 19
+; VI-NEXT:    v_readlane_b32 s68, v30, 18
+; VI-NEXT:    v_readlane_b32 s67, v30, 17
+; VI-NEXT:    v_readlane_b32 s66, v30, 16
+; VI-NEXT:    v_readlane_b32 s65, v30, 15
+; VI-NEXT:    v_readlane_b32 s64, v30, 14
+; VI-NEXT:    v_readlane_b32 s55, v30, 13
+; VI-NEXT:    v_readlane_b32 s54, v30, 12
+; VI-NEXT:    v_readlane_b32 s53, v30, 11
+; VI-NEXT:    v_readlane_b32 s52, v30, 10
+; VI-NEXT:    v_readlane_b32 s51, v30, 9
+; VI-NEXT:    v_readlane_b32 s50, v30, 8
+; VI-NEXT:    v_readlane_b32 s49, v30, 7
+; VI-NEXT:    v_readlane_b32 s48, v30, 6
+; VI-NEXT:    v_readlane_b32 s39, v30, 5
+; VI-NEXT:    v_readlane_b32 s38, v30, 4
+; VI-NEXT:    v_readlane_b32 s37, v30, 3
+; VI-NEXT:    v_readlane_b32 s36, v30, 2
+; VI-NEXT:    v_readlane_b32 s35, v30, 1
+; VI-NEXT:    v_readlane_b32 s34, v30, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -31135,7 +31135,7 @@ end:
   ret <15 x i64> %phi
 }
 
-define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) {
+define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v15i64_to_v60f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -32059,7 +32059,7 @@ end:
   ret <60 x half> %phi
 }
 
-define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i32 inreg %b) {
+define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v15i64_to_v60f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -32067,23 +32067,23 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v30, s30, 0
-; SI-NEXT:    v_writelane_b32 v30, s31, 1
-; SI-NEXT:    v_writelane_b32 v30, s34, 2
-; SI-NEXT:    v_writelane_b32 v30, s35, 3
-; SI-NEXT:    v_writelane_b32 v30, s36, 4
-; SI-NEXT:    v_writelane_b32 v30, s37, 5
-; SI-NEXT:    v_writelane_b32 v30, s38, 6
-; SI-NEXT:    v_writelane_b32 v30, s39, 7
-; SI-NEXT:    v_writelane_b32 v30, s48, 8
-; SI-NEXT:    v_writelane_b32 v30, s49, 9
-; SI-NEXT:    v_writelane_b32 v30, s50, 10
-; SI-NEXT:    v_writelane_b32 v30, s51, 11
-; SI-NEXT:    v_writelane_b32 v30, s52, 12
-; SI-NEXT:    v_writelane_b32 v30, s53, 13
-; SI-NEXT:    v_writelane_b32 v30, s54, 14
+; SI-NEXT:    v_writelane_b32 v30, s34, 0
+; SI-NEXT:    v_writelane_b32 v30, s35, 1
+; SI-NEXT:    v_writelane_b32 v30, s36, 2
+; SI-NEXT:    v_writelane_b32 v30, s37, 3
+; SI-NEXT:    v_writelane_b32 v30, s38, 4
+; SI-NEXT:    v_writelane_b32 v30, s39, 5
+; SI-NEXT:    v_writelane_b32 v30, s48, 6
+; SI-NEXT:    v_writelane_b32 v30, s49, 7
+; SI-NEXT:    v_writelane_b32 v30, s50, 8
+; SI-NEXT:    v_writelane_b32 v30, s51, 9
+; SI-NEXT:    v_writelane_b32 v30, s52, 10
+; SI-NEXT:    v_writelane_b32 v30, s53, 11
+; SI-NEXT:    v_writelane_b32 v30, s54, 12
+; SI-NEXT:    v_writelane_b32 v30, s55, 13
+; SI-NEXT:    v_writelane_b32 v30, s64, 14
 ; SI-NEXT:    v_readfirstlane_b32 s42, v16
-; SI-NEXT:    v_writelane_b32 v30, s55, 15
+; SI-NEXT:    v_writelane_b32 v30, s30, 15
 ; SI-NEXT:    v_readfirstlane_b32 s5, v15
 ; SI-NEXT:    v_readfirstlane_b32 s4, v14
 ; SI-NEXT:    v_readfirstlane_b32 s7, v13
@@ -32101,7 +32101,7 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
 ; SI-NEXT:    v_readfirstlane_b32 s43, v1
 ; SI-NEXT:    s_cmp_lg_u32 s42, 0
 ; SI-NEXT:    v_readfirstlane_b32 s42, v0
-; SI-NEXT:    v_writelane_b32 v30, s64, 16
+; SI-NEXT:    v_writelane_b32 v30, s31, 16
 ; SI-NEXT:    s_cbranch_scc0 .LBB45_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s34, s5, 16
@@ -32287,6 +32287,7 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
 ; SI-NEXT:    s_lshl_b32 s44, s34, 16
 ; SI-NEXT:    s_or_b32 s7, s7, s45
 ; SI-NEXT:    s_or_b32 s5, s5, s44
+; SI-NEXT:    v_readlane_b32 s30, v30, 15
 ; SI-NEXT:    v_mov_b32_e32 v0, s16
 ; SI-NEXT:    v_mov_b32_e32 v1, s17
 ; SI-NEXT:    v_mov_b32_e32 v2, s18
@@ -32317,23 +32318,22 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v27, s7
 ; SI-NEXT:    v_mov_b32_e32 v28, s4
 ; SI-NEXT:    v_mov_b32_e32 v29, s5
-; SI-NEXT:    v_readlane_b32 s64, v30, 16
-; SI-NEXT:    v_readlane_b32 s55, v30, 15
-; SI-NEXT:    v_readlane_b32 s54, v30, 14
-; SI-NEXT:    v_readlane_b32 s53, v30, 13
-; SI-NEXT:    v_readlane_b32 s52, v30, 12
-; SI-NEXT:    v_readlane_b32 s51, v30, 11
-; SI-NEXT:    v_readlane_b32 s50, v30, 10
-; SI-NEXT:    v_readlane_b32 s49, v30, 9
-; SI-NEXT:    v_readlane_b32 s48, v30, 8
-; SI-NEXT:    v_readlane_b32 s39, v30, 7
-; SI-NEXT:    v_readlane_b32 s38, v30, 6
-; SI-NEXT:    v_readlane_b32 s37, v30, 5
-; SI-NEXT:    v_readlane_b32 s36, v30, 4
-; SI-NEXT:    v_readlane_b32 s35, v30, 3
-; SI-NEXT:    v_readlane_b32 s34, v30, 2
-; SI-NEXT:    v_readlane_b32 s31, v30, 1
-; SI-NEXT:    v_readlane_b32 s30, v30, 0
+; SI-NEXT:    v_readlane_b32 s31, v30, 16
+; SI-NEXT:    v_readlane_b32 s64, v30, 14
+; SI-NEXT:    v_readlane_b32 s55, v30, 13
+; SI-NEXT:    v_readlane_b32 s54, v30, 12
+; SI-NEXT:    v_readlane_b32 s53, v30, 11
+; SI-NEXT:    v_readlane_b32 s52, v30, 10
+; SI-NEXT:    v_readlane_b32 s51, v30, 9
+; SI-NEXT:    v_readlane_b32 s50, v30, 8
+; SI-NEXT:    v_readlane_b32 s49, v30, 7
+; SI-NEXT:    v_readlane_b32 s48, v30, 6
+; SI-NEXT:    v_readlane_b32 s39, v30, 5
+; SI-NEXT:    v_readlane_b32 s38, v30, 4
+; SI-NEXT:    v_readlane_b32 s37, v30, 3
+; SI-NEXT:    v_readlane_b32 s36, v30, 2
+; SI-NEXT:    v_readlane_b32 s35, v30, 1
+; SI-NEXT:    v_readlane_b32 s34, v30, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -32378,14 +32378,14 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v30, s30, 0
-; VI-NEXT:    v_writelane_b32 v30, s31, 1
-; VI-NEXT:    v_writelane_b32 v30, s34, 2
-; VI-NEXT:    v_writelane_b32 v30, s35, 3
-; VI-NEXT:    v_writelane_b32 v30, s36, 4
-; VI-NEXT:    v_writelane_b32 v30, s37, 5
+; VI-NEXT:    v_writelane_b32 v30, s34, 0
+; VI-NEXT:    v_writelane_b32 v30, s35, 1
+; VI-NEXT:    v_writelane_b32 v30, s36, 2
+; VI-NEXT:    v_writelane_b32 v30, s37, 3
+; VI-NEXT:    v_writelane_b32 v30, s38, 4
+; VI-NEXT:    v_writelane_b32 v30, s39, 5
 ; VI-NEXT:    v_readfirstlane_b32 s4, v16
-; VI-NEXT:    v_writelane_b32 v30, s38, 6
+; VI-NEXT:    v_writelane_b32 v30, s30, 6
 ; VI-NEXT:    v_readfirstlane_b32 s6, v15
 ; VI-NEXT:    v_readfirstlane_b32 s7, v14
 ; VI-NEXT:    v_readfirstlane_b32 s8, v13
@@ -32403,7 +32403,7 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
 ; VI-NEXT:    v_readfirstlane_b32 s44, v1
 ; VI-NEXT:    s_cmp_lg_u32 s4, 0
 ; VI-NEXT:    v_readfirstlane_b32 s45, v0
-; VI-NEXT:    v_writelane_b32 v30, s39, 7
+; VI-NEXT:    v_writelane_b32 v30, s31, 7
 ; VI-NEXT:    s_cbranch_scc0 .LBB45_4
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    s_lshr_b32 s46, s6, 16
@@ -32589,6 +32589,7 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
 ; VI-NEXT:    s_and_b32 s6, 0xffff, s6
 ; VI-NEXT:    s_lshl_b32 s44, s46, 16
 ; VI-NEXT:    s_or_b32 s6, s6, s44
+; VI-NEXT:    v_readlane_b32 s30, v30, 6
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_mov_b32_e32 v2, s16
@@ -32619,14 +32620,13 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
 ; VI-NEXT:    v_mov_b32_e32 v27, s8
 ; VI-NEXT:    v_mov_b32_e32 v28, s7
 ; VI-NEXT:    v_mov_b32_e32 v29, s6
-; VI-NEXT:    v_readlane_b32 s39, v30, 7
-; VI-NEXT:    v_readlane_b32 s38, v30, 6
-; VI-NEXT:    v_readlane_b32 s37, v30, 5
-; VI-NEXT:    v_readlane_b32 s36, v30, 4
-; VI-NEXT:    v_readlane_b32 s35, v30, 3
-; VI-NEXT:    v_readlane_b32 s34, v30, 2
-; VI-NEXT:    v_readlane_b32 s31, v30, 1
-; VI-NEXT:    v_readlane_b32 s30, v30, 0
+; VI-NEXT:    v_readlane_b32 s31, v30, 7
+; VI-NEXT:    v_readlane_b32 s39, v30, 5
+; VI-NEXT:    v_readlane_b32 s38, v30, 4
+; VI-NEXT:    v_readlane_b32 s37, v30, 3
+; VI-NEXT:    v_readlane_b32 s36, v30, 2
+; VI-NEXT:    v_readlane_b32 s35, v30, 1
+; VI-NEXT:    v_readlane_b32 s34, v30, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -32671,10 +32671,10 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v30, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v30, s31, 1
+; GFX9-NEXT:    v_writelane_b32 v30, s34, 0
+; GFX9-NEXT:    v_writelane_b32 v30, s35, 1
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v16
-; GFX9-NEXT:    v_writelane_b32 v30, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v30, s30, 2
 ; GFX9-NEXT:    v_readfirstlane_b32 s6, v15
 ; GFX9-NEXT:    v_readfirstlane_b32 s7, v14
 ; GFX9-NEXT:    v_readfirstlane_b32 s8, v13
@@ -32692,7 +32692,7 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
 ; GFX9-NEXT:    v_readfirstlane_b32 s44, v1
 ; GFX9-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX9-NEXT:    v_readfirstlane_b32 s45, v0
-; GFX9-NEXT:    v_writelane_b32 v30, s35, 3
+; GFX9-NEXT:    v_writelane_b32 v30, s31, 3
 ; GFX9-NEXT:    s_cbranch_scc0 .LBB45_4
 ; GFX9-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX9-NEXT:    s_lshr_b32 s46, s6, 16
@@ -32818,6 +32818,7 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s8, s8, s56
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s7, s7, s47
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s6, s6, s46
+; GFX9-NEXT:    v_readlane_b32 s30, v30, 2
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s16
@@ -32848,10 +32849,9 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
 ; GFX9-NEXT:    v_mov_b32_e32 v27, s8
 ; GFX9-NEXT:    v_mov_b32_e32 v28, s7
 ; GFX9-NEXT:    v_mov_b32_e32 v29, s6
-; GFX9-NEXT:    v_readlane_b32 s35, v30, 3
-; GFX9-NEXT:    v_readlane_b32 s34, v30, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v30, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v30, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v30, 3
+; GFX9-NEXT:    v_readlane_b32 s35, v30, 1
+; GFX9-NEXT:    v_readlane_b32 s34, v30, 0
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -33100,7 +33100,7 @@ end:
   ret <60 x half> %phi
 }
 
-define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) {
+define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v60f16_to_v15i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -34589,7 +34589,7 @@ end:
   ret <15 x i64> %phi
 }
 
-define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i32 inreg %b) {
+define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v60f16_to_v15i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -34597,41 +34597,42 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v32, s30, 0
-; SI-NEXT:    v_writelane_b32 v32, s31, 1
-; SI-NEXT:    v_writelane_b32 v32, s34, 2
-; SI-NEXT:    v_writelane_b32 v32, s35, 3
-; SI-NEXT:    v_writelane_b32 v32, s36, 4
-; SI-NEXT:    v_writelane_b32 v32, s37, 5
-; SI-NEXT:    v_writelane_b32 v32, s38, 6
-; SI-NEXT:    v_writelane_b32 v32, s39, 7
-; SI-NEXT:    v_writelane_b32 v32, s48, 8
-; SI-NEXT:    v_writelane_b32 v32, s49, 9
-; SI-NEXT:    v_writelane_b32 v32, s50, 10
-; SI-NEXT:    v_writelane_b32 v32, s51, 11
-; SI-NEXT:    v_writelane_b32 v32, s52, 12
-; SI-NEXT:    v_writelane_b32 v32, s53, 13
-; SI-NEXT:    v_writelane_b32 v32, s54, 14
-; SI-NEXT:    v_writelane_b32 v32, s55, 15
-; SI-NEXT:    v_writelane_b32 v32, s64, 16
-; SI-NEXT:    v_writelane_b32 v32, s65, 17
-; SI-NEXT:    v_writelane_b32 v32, s66, 18
-; SI-NEXT:    v_writelane_b32 v32, s67, 19
-; SI-NEXT:    v_writelane_b32 v32, s68, 20
-; SI-NEXT:    v_writelane_b32 v32, s69, 21
-; SI-NEXT:    v_writelane_b32 v32, s70, 22
-; SI-NEXT:    v_writelane_b32 v32, s71, 23
-; SI-NEXT:    v_writelane_b32 v32, s80, 24
-; SI-NEXT:    v_writelane_b32 v32, s81, 25
-; SI-NEXT:    v_writelane_b32 v32, s82, 26
-; SI-NEXT:    v_writelane_b32 v32, s83, 27
-; SI-NEXT:    v_writelane_b32 v32, s84, 28
-; SI-NEXT:    v_writelane_b32 v32, s85, 29
-; SI-NEXT:    v_writelane_b32 v32, s86, 30
-; SI-NEXT:    v_writelane_b32 v32, s87, 31
-; SI-NEXT:    v_writelane_b32 v32, s96, 32
-; SI-NEXT:    v_writelane_b32 v32, s97, 33
-; SI-NEXT:    v_writelane_b32 v32, s98, 34
+; SI-NEXT:    v_writelane_b32 v32, s34, 0
+; SI-NEXT:    v_writelane_b32 v32, s35, 1
+; SI-NEXT:    v_writelane_b32 v32, s36, 2
+; SI-NEXT:    v_writelane_b32 v32, s37, 3
+; SI-NEXT:    v_writelane_b32 v32, s38, 4
+; SI-NEXT:    v_writelane_b32 v32, s39, 5
+; SI-NEXT:    v_writelane_b32 v32, s48, 6
+; SI-NEXT:    v_writelane_b32 v32, s49, 7
+; SI-NEXT:    v_writelane_b32 v32, s50, 8
+; SI-NEXT:    v_writelane_b32 v32, s51, 9
+; SI-NEXT:    v_writelane_b32 v32, s52, 10
+; SI-NEXT:    v_writelane_b32 v32, s53, 11
+; SI-NEXT:    v_writelane_b32 v32, s54, 12
+; SI-NEXT:    v_writelane_b32 v32, s55, 13
+; SI-NEXT:    v_writelane_b32 v32, s64, 14
+; SI-NEXT:    v_writelane_b32 v32, s65, 15
+; SI-NEXT:    v_writelane_b32 v32, s66, 16
+; SI-NEXT:    v_writelane_b32 v32, s67, 17
+; SI-NEXT:    v_writelane_b32 v32, s68, 18
+; SI-NEXT:    v_writelane_b32 v32, s69, 19
+; SI-NEXT:    v_writelane_b32 v32, s70, 20
+; SI-NEXT:    v_writelane_b32 v32, s71, 21
+; SI-NEXT:    v_writelane_b32 v32, s80, 22
+; SI-NEXT:    v_writelane_b32 v32, s81, 23
+; SI-NEXT:    v_writelane_b32 v32, s82, 24
+; SI-NEXT:    v_writelane_b32 v32, s83, 25
+; SI-NEXT:    v_writelane_b32 v32, s84, 26
+; SI-NEXT:    v_writelane_b32 v32, s85, 27
+; SI-NEXT:    v_writelane_b32 v32, s86, 28
+; SI-NEXT:    v_writelane_b32 v32, s87, 29
+; SI-NEXT:    v_writelane_b32 v32, s96, 30
+; SI-NEXT:    v_writelane_b32 v32, s97, 31
+; SI-NEXT:    v_writelane_b32 v32, s98, 32
+; SI-NEXT:    v_writelane_b32 v32, s99, 33
+; SI-NEXT:    v_writelane_b32 v32, s30, 34
+; SI-NEXT:    v_writelane_b32 v32, s31, 35
 ; SI-NEXT:    v_readfirstlane_b32 s6, v15
 ; SI-NEXT:    v_readfirstlane_b32 s8, v14
 ; SI-NEXT:    v_readfirstlane_b32 s10, v13
@@ -34648,7 +34649,6 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i
 ; SI-NEXT:    v_readfirstlane_b32 s30, v2
 ; SI-NEXT:    v_readfirstlane_b32 s35, v1
 ; SI-NEXT:    v_readfirstlane_b32 s70, v0
-; SI-NEXT:    v_writelane_b32 v32, s99, 35
 ; SI-NEXT:    s_lshr_b32 s31, s29, 16
 ; SI-NEXT:    s_lshr_b32 s68, s28, 16
 ; SI-NEXT:    s_lshr_b32 s71, s27, 16
@@ -35053,42 +35053,42 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v30, s66
 ; SI-NEXT:    v_mov_b32_e32 v31, s67
 ; SI-NEXT:  .LBB47_5: ; %end
-; SI-NEXT:    v_readlane_b32 s99, v32, 35
-; SI-NEXT:    v_readlane_b32 s98, v32, 34
-; SI-NEXT:    v_readlane_b32 s97, v32, 33
-; SI-NEXT:    v_readlane_b32 s96, v32, 32
-; SI-NEXT:    v_readlane_b32 s87, v32, 31
-; SI-NEXT:    v_readlane_b32 s86, v32, 30
-; SI-NEXT:    v_readlane_b32 s85, v32, 29
-; SI-NEXT:    v_readlane_b32 s84, v32, 28
-; SI-NEXT:    v_readlane_b32 s83, v32, 27
-; SI-NEXT:    v_readlane_b32 s82, v32, 26
-; SI-NEXT:    v_readlane_b32 s81, v32, 25
-; SI-NEXT:    v_readlane_b32 s80, v32, 24
-; SI-NEXT:    v_readlane_b32 s71, v32, 23
-; SI-NEXT:    v_readlane_b32 s70, v32, 22
-; SI-NEXT:    v_readlane_b32 s69, v32, 21
-; SI-NEXT:    v_readlane_b32 s68, v32, 20
-; SI-NEXT:    v_readlane_b32 s67, v32, 19
-; SI-NEXT:    v_readlane_b32 s66, v32, 18
-; SI-NEXT:    v_readlane_b32 s65, v32, 17
-; SI-NEXT:    v_readlane_b32 s64, v32, 16
-; SI-NEXT:    v_readlane_b32 s55, v32, 15
-; SI-NEXT:    v_readlane_b32 s54, v32, 14
-; SI-NEXT:    v_readlane_b32 s53, v32, 13
-; SI-NEXT:    v_readlane_b32 s52, v32, 12
-; SI-NEXT:    v_readlane_b32 s51, v32, 11
-; SI-NEXT:    v_readlane_b32 s50, v32, 10
-; SI-NEXT:    v_readlane_b32 s49, v32, 9
-; SI-NEXT:    v_readlane_b32 s48, v32, 8
-; SI-NEXT:    v_readlane_b32 s39, v32, 7
-; SI-NEXT:    v_readlane_b32 s38, v32, 6
-; SI-NEXT:    v_readlane_b32 s37, v32, 5
-; SI-NEXT:    v_readlane_b32 s36, v32, 4
-; SI-NEXT:    v_readlane_b32 s35, v32, 3
-; SI-NEXT:    v_readlane_b32 s34, v32, 2
-; SI-NEXT:    v_readlane_b32 s31, v32, 1
-; SI-NEXT:    v_readlane_b32 s30, v32, 0
+; SI-NEXT:    v_readlane_b32 s30, v32, 34
+; SI-NEXT:    v_readlane_b32 s31, v32, 35
+; SI-NEXT:    v_readlane_b32 s99, v32, 33
+; SI-NEXT:    v_readlane_b32 s98, v32, 32
+; SI-NEXT:    v_readlane_b32 s97, v32, 31
+; SI-NEXT:    v_readlane_b32 s96, v32, 30
+; SI-NEXT:    v_readlane_b32 s87, v32, 29
+; SI-NEXT:    v_readlane_b32 s86, v32, 28
+; SI-NEXT:    v_readlane_b32 s85, v32, 27
+; SI-NEXT:    v_readlane_b32 s84, v32, 26
+; SI-NEXT:    v_readlane_b32 s83, v32, 25
+; SI-NEXT:    v_readlane_b32 s82, v32, 24
+; SI-NEXT:    v_readlane_b32 s81, v32, 23
+; SI-NEXT:    v_readlane_b32 s80, v32, 22
+; SI-NEXT:    v_readlane_b32 s71, v32, 21
+; SI-NEXT:    v_readlane_b32 s70, v32, 20
+; SI-NEXT:    v_readlane_b32 s69, v32, 19
+; SI-NEXT:    v_readlane_b32 s68, v32, 18
+; SI-NEXT:    v_readlane_b32 s67, v32, 17
+; SI-NEXT:    v_readlane_b32 s66, v32, 16
+; SI-NEXT:    v_readlane_b32 s65, v32, 15
+; SI-NEXT:    v_readlane_b32 s64, v32, 14
+; SI-NEXT:    v_readlane_b32 s55, v32, 13
+; SI-NEXT:    v_readlane_b32 s54, v32, 12
+; SI-NEXT:    v_readlane_b32 s53, v32, 11
+; SI-NEXT:    v_readlane_b32 s52, v32, 10
+; SI-NEXT:    v_readlane_b32 s51, v32, 9
+; SI-NEXT:    v_readlane_b32 s50, v32, 8
+; SI-NEXT:    v_readlane_b32 s49, v32, 7
+; SI-NEXT:    v_readlane_b32 s48, v32, 6
+; SI-NEXT:    v_readlane_b32 s39, v32, 5
+; SI-NEXT:    v_readlane_b32 s38, v32, 4
+; SI-NEXT:    v_readlane_b32 s37, v32, 3
+; SI-NEXT:    v_readlane_b32 s36, v32, 2
+; SI-NEXT:    v_readlane_b32 s35, v32, 1
+; SI-NEXT:    v_readlane_b32 s34, v32, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -35102,53 +35102,54 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v32, s30, 0
-; VI-NEXT:    v_writelane_b32 v32, s31, 1
-; VI-NEXT:    v_writelane_b32 v32, s34, 2
-; VI-NEXT:    v_writelane_b32 v32, s35, 3
-; VI-NEXT:    v_writelane_b32 v32, s36, 4
-; VI-NEXT:    v_writelane_b32 v32, s37, 5
-; VI-NEXT:    v_writelane_b32 v32, s38, 6
-; VI-NEXT:    v_writelane_b32 v32, s39, 7
-; VI-NEXT:    v_writelane_b32 v32, s48, 8
-; VI-NEXT:    v_writelane_b32 v32, s49, 9
-; VI-NEXT:    v_writelane_b32 v32, s50, 10
-; VI-NEXT:    v_writelane_b32 v32, s51, 11
-; VI-NEXT:    v_writelane_b32 v32, s52, 12
-; VI-NEXT:    v_writelane_b32 v32, s53, 13
-; VI-NEXT:    v_writelane_b32 v32, s54, 14
-; VI-NEXT:    v_writelane_b32 v32, s55, 15
-; VI-NEXT:    v_writelane_b32 v32, s64, 16
-; VI-NEXT:    v_writelane_b32 v32, s65, 17
-; VI-NEXT:    v_writelane_b32 v32, s66, 18
-; VI-NEXT:    v_writelane_b32 v32, s67, 19
-; VI-NEXT:    v_writelane_b32 v32, s68, 20
-; VI-NEXT:    v_writelane_b32 v32, s69, 21
-; VI-NEXT:    v_writelane_b32 v32, s70, 22
-; VI-NEXT:    v_writelane_b32 v32, s71, 23
-; VI-NEXT:    v_writelane_b32 v32, s80, 24
+; VI-NEXT:    v_writelane_b32 v32, s34, 0
+; VI-NEXT:    v_writelane_b32 v32, s35, 1
+; VI-NEXT:    v_writelane_b32 v32, s36, 2
+; VI-NEXT:    v_writelane_b32 v32, s37, 3
+; VI-NEXT:    v_writelane_b32 v32, s38, 4
+; VI-NEXT:    v_writelane_b32 v32, s39, 5
+; VI-NEXT:    v_writelane_b32 v32, s48, 6
+; VI-NEXT:    v_writelane_b32 v32, s49, 7
+; VI-NEXT:    v_writelane_b32 v32, s50, 8
+; VI-NEXT:    v_writelane_b32 v32, s51, 9
+; VI-NEXT:    v_writelane_b32 v32, s52, 10
+; VI-NEXT:    v_writelane_b32 v32, s53, 11
+; VI-NEXT:    v_writelane_b32 v32, s54, 12
+; VI-NEXT:    v_writelane_b32 v32, s55, 13
+; VI-NEXT:    v_writelane_b32 v32, s64, 14
+; VI-NEXT:    v_writelane_b32 v32, s65, 15
+; VI-NEXT:    v_writelane_b32 v32, s66, 16
+; VI-NEXT:    v_writelane_b32 v32, s67, 17
+; VI-NEXT:    v_writelane_b32 v32, s68, 18
+; VI-NEXT:    v_writelane_b32 v32, s69, 19
+; VI-NEXT:    v_writelane_b32 v32, s70, 20
+; VI-NEXT:    v_writelane_b32 v32, s71, 21
+; VI-NEXT:    v_writelane_b32 v32, s80, 22
+; VI-NEXT:    v_writelane_b32 v32, s81, 23
+; VI-NEXT:    v_writelane_b32 v32, s82, 24
+; VI-NEXT:    v_writelane_b32 v32, s83, 25
 ; VI-NEXT:    v_readfirstlane_b32 s6, v15
-; VI-NEXT:    v_writelane_b32 v32, s81, 25
+; VI-NEXT:    v_writelane_b32 v32, s84, 26
 ; VI-NEXT:    s_lshr_b32 vcc_lo, s6, 16
 ; VI-NEXT:    v_readfirstlane_b32 s8, v14
 ; VI-NEXT:    ; implicit-def: $vgpr33 : SGPR spill to VGPR lane
-; VI-NEXT:    v_writelane_b32 v32, s82, 26
+; VI-NEXT:    v_writelane_b32 v32, s85, 27
 ; VI-NEXT:    s_lshr_b32 vcc_hi, s8, 16
 ; VI-NEXT:    v_readfirstlane_b32 s10, v13
 ; VI-NEXT:    v_writelane_b32 v33, vcc_lo, 0
-; VI-NEXT:    v_writelane_b32 v32, s83, 27
+; VI-NEXT:    v_writelane_b32 v32, s86, 28
 ; VI-NEXT:    s_lshr_b32 s63, s10, 16
 ; VI-NEXT:    v_readfirstlane_b32 s12, v12
 ; VI-NEXT:    v_writelane_b32 v33, vcc_hi, 1
-; VI-NEXT:    v_writelane_b32 v32, s84, 28
+; VI-NEXT:    v_writelane_b32 v32, s87, 29
 ; VI-NEXT:    s_lshr_b32 s62, s12, 16
 ; VI-NEXT:    v_readfirstlane_b32 s14, v11
 ; VI-NEXT:    v_writelane_b32 v33, s63, 2
-; VI-NEXT:    v_writelane_b32 v32, s85, 29
+; VI-NEXT:    v_writelane_b32 v32, s30, 30
 ; VI-NEXT:    s_lshr_b32 s61, s14, 16
 ; VI-NEXT:    v_readfirstlane_b32 s72, v10
 ; VI-NEXT:    v_writelane_b32 v33, s62, 3
-; VI-NEXT:    v_writelane_b32 v32, s86, 30
+; VI-NEXT:    v_writelane_b32 v32, s31, 31
 ; VI-NEXT:    s_lshr_b32 s60, s72, 16
 ; VI-NEXT:    v_readfirstlane_b32 s74, v9
 ; VI-NEXT:    v_readfirstlane_b32 s76, v8
@@ -35161,7 +35162,6 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i
 ; VI-NEXT:    v_readfirstlane_b32 s85, v1
 ; VI-NEXT:    v_readfirstlane_b32 s7, v0
 ; VI-NEXT:    v_writelane_b32 v33, s61, 4
-; VI-NEXT:    v_writelane_b32 v32, s87, 31
 ; VI-NEXT:    s_lshr_b32 s56, s29, 16
 ; VI-NEXT:    s_lshr_b32 s88, s28, 16
 ; VI-NEXT:    s_lshr_b32 s31, s27, 16
@@ -35460,38 +35460,38 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i
 ; VI-NEXT:    v_mov_b32_e32 v30, s66
 ; VI-NEXT:    v_mov_b32_e32 v31, s67
 ; VI-NEXT:  .LBB47_5: ; %end
-; VI-NEXT:    v_readlane_b32 s87, v32, 31
-; VI-NEXT:    v_readlane_b32 s86, v32, 30
-; VI-NEXT:    v_readlane_b32 s85, v32, 29
-; VI-NEXT:    v_readlane_b32 s84, v32, 28
-; VI-NEXT:    v_readlane_b32 s83, v32, 27
-; VI-NEXT:    v_readlane_b32 s82, v32, 26
-; VI-NEXT:    v_readlane_b32 s81, v32, 25
-; VI-NEXT:    v_readlane_b32 s80, v32, 24
-; VI-NEXT:    v_readlane_b32 s71, v32, 23
-; VI-NEXT:    v_readlane_b32 s70, v32, 22
-; VI-NEXT:    v_readlane_b32 s69, v32, 21
-; VI-NEXT:    v_readlane_b32 s68, v32, 20
-; VI-NEXT:    v_readlane_b32 s67, v32, 19
-; VI-NEXT:    v_readlane_b32 s66, v32, 18
-; VI-NEXT:    v_readlane_b32 s65, v32, 17
-; VI-NEXT:    v_readlane_b32 s64, v32, 16
-; VI-NEXT:    v_readlane_b32 s55, v32, 15
-; VI-NEXT:    v_readlane_b32 s54, v32, 14
-; VI-NEXT:    v_readlane_b32 s53, v32, 13
-; VI-NEXT:    v_readlane_b32 s52, v32, 12
-; VI-NEXT:    v_readlane_b32 s51, v32, 11
-; VI-NEXT:    v_readlane_b32 s50, v32, 10
-; VI-NEXT:    v_readlane_b32 s49, v32, 9
-; VI-NEXT:    v_readlane_b32 s48, v32, 8
-; VI-NEXT:    v_readlane_b32 s39, v32, 7
-; VI-NEXT:    v_readlane_b32 s38, v32, 6
-; VI-NEXT:    v_readlane_b32 s37, v32, 5
-; VI-NEXT:    v_readlane_b32 s36, v32, 4
-; VI-NEXT:    v_readlane_b32 s35, v32, 3
-; VI-NEXT:    v_readlane_b32 s34, v32, 2
-; VI-NEXT:    v_readlane_b32 s31, v32, 1
-; VI-NEXT:    v_readlane_b32 s30, v32, 0
+; VI-NEXT:    v_readlane_b32 s30, v32, 30
+; VI-NEXT:    v_readlane_b32 s31, v32, 31
+; VI-NEXT:    v_readlane_b32 s87, v32, 29
+; VI-NEXT:    v_readlane_b32 s86, v32, 28
+; VI-NEXT:    v_readlane_b32 s85, v32, 27
+; VI-NEXT:    v_readlane_b32 s84, v32, 26
+; VI-NEXT:    v_readlane_b32 s83, v32, 25
+; VI-NEXT:    v_readlane_b32 s82, v32, 24
+; VI-NEXT:    v_readlane_b32 s81, v32, 23
+; VI-NEXT:    v_readlane_b32 s80, v32, 22
+; VI-NEXT:    v_readlane_b32 s71, v32, 21
+; VI-NEXT:    v_readlane_b32 s70, v32, 20
+; VI-NEXT:    v_readlane_b32 s69, v32, 19
+; VI-NEXT:    v_readlane_b32 s68, v32, 18
+; VI-NEXT:    v_readlane_b32 s67, v32, 17
+; VI-NEXT:    v_readlane_b32 s66, v32, 16
+; VI-NEXT:    v_readlane_b32 s65, v32, 15
+; VI-NEXT:    v_readlane_b32 s64, v32, 14
+; VI-NEXT:    v_readlane_b32 s55, v32, 13
+; VI-NEXT:    v_readlane_b32 s54, v32, 12
+; VI-NEXT:    v_readlane_b32 s53, v32, 11
+; VI-NEXT:    v_readlane_b32 s52, v32, 10
+; VI-NEXT:    v_readlane_b32 s51, v32, 9
+; VI-NEXT:    v_readlane_b32 s50, v32, 8
+; VI-NEXT:    v_readlane_b32 s49, v32, 7
+; VI-NEXT:    v_readlane_b32 s48, v32, 6
+; VI-NEXT:    v_readlane_b32 s39, v32, 5
+; VI-NEXT:    v_readlane_b32 s38, v32, 4
+; VI-NEXT:    v_readlane_b32 s37, v32, 3
+; VI-NEXT:    v_readlane_b32 s36, v32, 2
+; VI-NEXT:    v_readlane_b32 s35, v32, 1
+; VI-NEXT:    v_readlane_b32 s34, v32, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -35840,7 +35840,7 @@ end:
   ret <15 x i64> %phi
 }
 
-define <60 x i16> @bitcast_v15f64_to_v60i16(<15 x double> %a, i32 %b) {
+define <60 x i16> @bitcast_v15f64_to_v60i16(<15 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v15f64_to_v60i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -36673,7 +36673,7 @@ end:
   ret <60 x i16> %phi
 }
 
-define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a, i32 inreg %b) {
+define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v15f64_to_v60i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -36681,23 +36681,23 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a,
 ; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v62, s30, 0
-; SI-NEXT:    v_writelane_b32 v62, s31, 1
-; SI-NEXT:    v_writelane_b32 v62, s34, 2
-; SI-NEXT:    v_writelane_b32 v62, s35, 3
-; SI-NEXT:    v_writelane_b32 v62, s36, 4
-; SI-NEXT:    v_writelane_b32 v62, s37, 5
-; SI-NEXT:    v_writelane_b32 v62, s38, 6
-; SI-NEXT:    v_writelane_b32 v62, s39, 7
-; SI-NEXT:    v_writelane_b32 v62, s48, 8
-; SI-NEXT:    v_writelane_b32 v62, s49, 9
-; SI-NEXT:    v_writelane_b32 v62, s50, 10
-; SI-NEXT:    v_writelane_b32 v62, s51, 11
-; SI-NEXT:    v_writelane_b32 v62, s52, 12
-; SI-NEXT:    v_writelane_b32 v62, s53, 13
-; SI-NEXT:    v_writelane_b32 v62, s54, 14
+; SI-NEXT:    v_writelane_b32 v62, s34, 0
+; SI-NEXT:    v_writelane_b32 v62, s35, 1
+; SI-NEXT:    v_writelane_b32 v62, s36, 2
+; SI-NEXT:    v_writelane_b32 v62, s37, 3
+; SI-NEXT:    v_writelane_b32 v62, s38, 4
+; SI-NEXT:    v_writelane_b32 v62, s39, 5
+; SI-NEXT:    v_writelane_b32 v62, s48, 6
+; SI-NEXT:    v_writelane_b32 v62, s49, 7
+; SI-NEXT:    v_writelane_b32 v62, s50, 8
+; SI-NEXT:    v_writelane_b32 v62, s51, 9
+; SI-NEXT:    v_writelane_b32 v62, s52, 10
+; SI-NEXT:    v_writelane_b32 v62, s53, 11
+; SI-NEXT:    v_writelane_b32 v62, s54, 12
+; SI-NEXT:    v_writelane_b32 v62, s55, 13
+; SI-NEXT:    v_writelane_b32 v62, s64, 14
 ; SI-NEXT:    v_readfirstlane_b32 s42, v16
-; SI-NEXT:    v_writelane_b32 v62, s55, 15
+; SI-NEXT:    v_writelane_b32 v62, s30, 15
 ; SI-NEXT:    v_readfirstlane_b32 s15, v15
 ; SI-NEXT:    v_readfirstlane_b32 s14, v14
 ; SI-NEXT:    v_readfirstlane_b32 s41, v13
@@ -36729,7 +36729,7 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a,
 ; SI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill
-; SI-NEXT:    v_writelane_b32 v62, s64, 16
+; SI-NEXT:    v_writelane_b32 v62, s31, 16
 ; SI-NEXT:    s_cbranch_scc0 .LBB49_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s64, s15, 16
@@ -37012,6 +37012,7 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a,
 ; SI-NEXT:    v_or_b32_e32 v28, v28, v30
 ; SI-NEXT:    v_and_b32_e32 v29, 0xffff, v29
 ; SI-NEXT:    v_lshlrev_b32_e32 v30, 16, v55
+; SI-NEXT:    v_readlane_b32 s30, v62, 15
 ; SI-NEXT:    v_or_b32_e32 v11, v11, v39
 ; SI-NEXT:    v_or_b32_e32 v13, v13, v38
 ; SI-NEXT:    v_or_b32_e32 v15, v15, v37
@@ -37022,23 +37023,22 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a,
 ; SI-NEXT:    v_or_b32_e32 v25, v25, v32
 ; SI-NEXT:    v_or_b32_e32 v27, v27, v31
 ; SI-NEXT:    v_or_b32_e32 v29, v29, v30
-; SI-NEXT:    v_readlane_b32 s64, v62, 16
-; SI-NEXT:    v_readlane_b32 s55, v62, 15
-; SI-NEXT:    v_readlane_b32 s54, v62, 14
-; SI-NEXT:    v_readlane_b32 s53, v62, 13
-; SI-NEXT:    v_readlane_b32 s52, v62, 12
-; SI-NEXT:    v_readlane_b32 s51, v62, 11
-; SI-NEXT:    v_readlane_b32 s50, v62, 10
-; SI-NEXT:    v_readlane_b32 s49, v62, 9
-; SI-NEXT:    v_readlane_b32 s48, v62, 8
-; SI-NEXT:    v_readlane_b32 s39, v62, 7
-; SI-NEXT:    v_readlane_b32 s38, v62, 6
-; SI-NEXT:    v_readlane_b32 s37, v62, 5
-; SI-NEXT:    v_readlane_b32 s36, v62, 4
-; SI-NEXT:    v_readlane_b32 s35, v62, 3
-; SI-NEXT:    v_readlane_b32 s34, v62, 2
-; SI-NEXT:    v_readlane_b32 s31, v62, 1
-; SI-NEXT:    v_readlane_b32 s30, v62, 0
+; SI-NEXT:    v_readlane_b32 s31, v62, 16
+; SI-NEXT:    v_readlane_b32 s64, v62, 14
+; SI-NEXT:    v_readlane_b32 s55, v62, 13
+; SI-NEXT:    v_readlane_b32 s54, v62, 12
+; SI-NEXT:    v_readlane_b32 s53, v62, 11
+; SI-NEXT:    v_readlane_b32 s52, v62, 10
+; SI-NEXT:    v_readlane_b32 s51, v62, 9
+; SI-NEXT:    v_readlane_b32 s50, v62, 8
+; SI-NEXT:    v_readlane_b32 s49, v62, 7
+; SI-NEXT:    v_readlane_b32 s48, v62, 6
+; SI-NEXT:    v_readlane_b32 s39, v62, 5
+; SI-NEXT:    v_readlane_b32 s38, v62, 4
+; SI-NEXT:    v_readlane_b32 s37, v62, 3
+; SI-NEXT:    v_readlane_b32 s36, v62, 2
+; SI-NEXT:    v_readlane_b32 s35, v62, 1
+; SI-NEXT:    v_readlane_b32 s34, v62, 0
 ; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -37051,14 +37051,14 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a,
 ; VI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v60, s30, 0
-; VI-NEXT:    v_writelane_b32 v60, s31, 1
-; VI-NEXT:    v_writelane_b32 v60, s34, 2
-; VI-NEXT:    v_writelane_b32 v60, s35, 3
-; VI-NEXT:    v_writelane_b32 v60, s36, 4
-; VI-NEXT:    v_writelane_b32 v60, s37, 5
+; VI-NEXT:    v_writelane_b32 v60, s34, 0
+; VI-NEXT:    v_writelane_b32 v60, s35, 1
+; VI-NEXT:    v_writelane_b32 v60, s36, 2
+; VI-NEXT:    v_writelane_b32 v60, s37, 3
+; VI-NEXT:    v_writelane_b32 v60, s38, 4
+; VI-NEXT:    v_writelane_b32 v60, s39, 5
 ; VI-NEXT:    v_readfirstlane_b32 s4, v16
-; VI-NEXT:    v_writelane_b32 v60, s38, 6
+; VI-NEXT:    v_writelane_b32 v60, s30, 6
 ; VI-NEXT:    v_readfirstlane_b32 s9, v15
 ; VI-NEXT:    v_readfirstlane_b32 s8, v14
 ; VI-NEXT:    v_readfirstlane_b32 s11, v13
@@ -37088,7 +37088,7 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a,
 ; VI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill
-; VI-NEXT:    v_writelane_b32 v60, s39, 7
+; VI-NEXT:    v_writelane_b32 v60, s31, 7
 ; VI-NEXT:    s_cbranch_scc0 .LBB49_3
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    s_lshr_b32 s46, s9, 16
@@ -37325,6 +37325,7 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a,
 ; VI-NEXT:    v_lshlrev_b32_e32 v31, 16, v35
 ; VI-NEXT:    v_or_b32_sdwa v28, v28, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_lshlrev_b32_e32 v30, 16, v33
+; VI-NEXT:    v_readlane_b32 s30, v60, 6
 ; VI-NEXT:    v_or_b32_sdwa v11, v11, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v13, v13, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v15, v15, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -37335,14 +37336,13 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a,
 ; VI-NEXT:    v_or_b32_sdwa v25, v25, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v27, v27, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_readlane_b32 s39, v60, 7
-; VI-NEXT:    v_readlane_b32 s38, v60, 6
-; VI-NEXT:    v_readlane_b32 s37, v60, 5
-; VI-NEXT:    v_readlane_b32 s36, v60, 4
-; VI-NEXT:    v_readlane_b32 s35, v60, 3
-; VI-NEXT:    v_readlane_b32 s34, v60, 2
-; VI-NEXT:    v_readlane_b32 s31, v60, 1
-; VI-NEXT:    v_readlane_b32 s30, v60, 0
+; VI-NEXT:    v_readlane_b32 s31, v60, 7
+; VI-NEXT:    v_readlane_b32 s39, v60, 5
+; VI-NEXT:    v_readlane_b32 s38, v60, 4
+; VI-NEXT:    v_readlane_b32 s37, v60, 3
+; VI-NEXT:    v_readlane_b32 s36, v60, 2
+; VI-NEXT:    v_readlane_b32 s35, v60, 1
+; VI-NEXT:    v_readlane_b32 s34, v60, 0
 ; VI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -37355,10 +37355,10 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a,
 ; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v60, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v60, s31, 1
+; GFX9-NEXT:    v_writelane_b32 v60, s34, 0
+; GFX9-NEXT:    v_writelane_b32 v60, s35, 1
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v16
-; GFX9-NEXT:    v_writelane_b32 v60, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v60, s30, 2
 ; GFX9-NEXT:    v_readfirstlane_b32 s9, v15
 ; GFX9-NEXT:    v_readfirstlane_b32 s8, v14
 ; GFX9-NEXT:    v_readfirstlane_b32 s11, v13
@@ -37388,7 +37388,7 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a,
 ; GFX9-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_writelane_b32 v60, s35, 3
+; GFX9-NEXT:    v_writelane_b32 v60, s31, 3
 ; GFX9-NEXT:    s_cbranch_scc0 .LBB49_3
 ; GFX9-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX9-NEXT:    s_lshr_b32 s46, s9, 16
@@ -37617,6 +37617,7 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a,
 ; GFX9-NEXT:    v_and_b32_e32 v27, 0xffff, v27
 ; GFX9-NEXT:    v_and_b32_e32 v28, 0xffff, v28
 ; GFX9-NEXT:    v_and_b32_e32 v29, 0xffff, v29
+; GFX9-NEXT:    v_readlane_b32 s30, v60, 2
 ; GFX9-NEXT:    v_lshl_or_b32 v10, v54, 16, v10
 ; GFX9-NEXT:    v_lshl_or_b32 v12, v52, 16, v12
 ; GFX9-NEXT:    v_lshl_or_b32 v14, v50, 16, v14
@@ -37635,10 +37636,9 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a,
 ; GFX9-NEXT:    v_lshl_or_b32 v27, v35, 16, v27
 ; GFX9-NEXT:    v_lshl_or_b32 v28, v30, 16, v28
 ; GFX9-NEXT:    v_lshl_or_b32 v29, v33, 16, v29
-; GFX9-NEXT:    v_readlane_b32 s35, v60, 3
-; GFX9-NEXT:    v_readlane_b32 s34, v60, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v60, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v60, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v60, 3
+; GFX9-NEXT:    v_readlane_b32 s35, v60, 1
+; GFX9-NEXT:    v_readlane_b32 s34, v60, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -38083,7 +38083,7 @@ end:
   ret <60 x i16> %phi
 }
 
-define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) {
+define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v60i16_to_v15f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -39428,7 +39428,7 @@ end:
   ret <15 x double> %phi
 }
 
-define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, i32 inreg %b) {
+define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v60i16_to_v15f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -39436,41 +39436,42 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a,
 ; SI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v30, s30, 0
-; SI-NEXT:    v_writelane_b32 v30, s31, 1
-; SI-NEXT:    v_writelane_b32 v30, s34, 2
-; SI-NEXT:    v_writelane_b32 v30, s35, 3
-; SI-NEXT:    v_writelane_b32 v30, s36, 4
-; SI-NEXT:    v_writelane_b32 v30, s37, 5
-; SI-NEXT:    v_writelane_b32 v30, s38, 6
-; SI-NEXT:    v_writelane_b32 v30, s39, 7
-; SI-NEXT:    v_writelane_b32 v30, s48, 8
-; SI-NEXT:    v_writelane_b32 v30, s49, 9
-; SI-NEXT:    v_writelane_b32 v30, s50, 10
-; SI-NEXT:    v_writelane_b32 v30, s51, 11
-; SI-NEXT:    v_writelane_b32 v30, s52, 12
-; SI-NEXT:    v_writelane_b32 v30, s53, 13
-; SI-NEXT:    v_writelane_b32 v30, s54, 14
-; SI-NEXT:    v_writelane_b32 v30, s55, 15
-; SI-NEXT:    v_writelane_b32 v30, s64, 16
-; SI-NEXT:    v_writelane_b32 v30, s65, 17
-; SI-NEXT:    v_writelane_b32 v30, s66, 18
-; SI-NEXT:    v_writelane_b32 v30, s67, 19
-; SI-NEXT:    v_writelane_b32 v30, s68, 20
-; SI-NEXT:    v_writelane_b32 v30, s69, 21
-; SI-NEXT:    v_writelane_b32 v30, s70, 22
-; SI-NEXT:    v_writelane_b32 v30, s71, 23
-; SI-NEXT:    v_writelane_b32 v30, s80, 24
-; SI-NEXT:    v_writelane_b32 v30, s81, 25
-; SI-NEXT:    v_writelane_b32 v30, s82, 26
-; SI-NEXT:    v_writelane_b32 v30, s83, 27
-; SI-NEXT:    v_writelane_b32 v30, s84, 28
-; SI-NEXT:    v_writelane_b32 v30, s85, 29
-; SI-NEXT:    v_writelane_b32 v30, s86, 30
-; SI-NEXT:    v_writelane_b32 v30, s87, 31
-; SI-NEXT:    v_writelane_b32 v30, s96, 32
-; SI-NEXT:    v_writelane_b32 v30, s97, 33
-; SI-NEXT:    v_writelane_b32 v30, s98, 34
+; SI-NEXT:    v_writelane_b32 v30, s34, 0
+; SI-NEXT:    v_writelane_b32 v30, s35, 1
+; SI-NEXT:    v_writelane_b32 v30, s36, 2
+; SI-NEXT:    v_writelane_b32 v30, s37, 3
+; SI-NEXT:    v_writelane_b32 v30, s38, 4
+; SI-NEXT:    v_writelane_b32 v30, s39, 5
+; SI-NEXT:    v_writelane_b32 v30, s48, 6
+; SI-NEXT:    v_writelane_b32 v30, s49, 7
+; SI-NEXT:    v_writelane_b32 v30, s50, 8
+; SI-NEXT:    v_writelane_b32 v30, s51, 9
+; SI-NEXT:    v_writelane_b32 v30, s52, 10
+; SI-NEXT:    v_writelane_b32 v30, s53, 11
+; SI-NEXT:    v_writelane_b32 v30, s54, 12
+; SI-NEXT:    v_writelane_b32 v30, s55, 13
+; SI-NEXT:    v_writelane_b32 v30, s64, 14
+; SI-NEXT:    v_writelane_b32 v30, s65, 15
+; SI-NEXT:    v_writelane_b32 v30, s66, 16
+; SI-NEXT:    v_writelane_b32 v30, s67, 17
+; SI-NEXT:    v_writelane_b32 v30, s68, 18
+; SI-NEXT:    v_writelane_b32 v30, s69, 19
+; SI-NEXT:    v_writelane_b32 v30, s70, 20
+; SI-NEXT:    v_writelane_b32 v30, s71, 21
+; SI-NEXT:    v_writelane_b32 v30, s80, 22
+; SI-NEXT:    v_writelane_b32 v30, s81, 23
+; SI-NEXT:    v_writelane_b32 v30, s82, 24
+; SI-NEXT:    v_writelane_b32 v30, s83, 25
+; SI-NEXT:    v_writelane_b32 v30, s84, 26
+; SI-NEXT:    v_writelane_b32 v30, s85, 27
+; SI-NEXT:    v_writelane_b32 v30, s86, 28
+; SI-NEXT:    v_writelane_b32 v30, s87, 29
+; SI-NEXT:    v_writelane_b32 v30, s96, 30
+; SI-NEXT:    v_writelane_b32 v30, s97, 31
+; SI-NEXT:    v_writelane_b32 v30, s98, 32
+; SI-NEXT:    v_writelane_b32 v30, s99, 33
+; SI-NEXT:    v_writelane_b32 v30, s30, 34
+; SI-NEXT:    v_writelane_b32 v30, s31, 35
 ; SI-NEXT:    v_readfirstlane_b32 s7, v15
 ; SI-NEXT:    v_readfirstlane_b32 s9, v14
 ; SI-NEXT:    v_readfirstlane_b32 s11, v13
@@ -39487,7 +39488,6 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a,
 ; SI-NEXT:    v_readfirstlane_b32 s70, v2
 ; SI-NEXT:    v_readfirstlane_b32 s81, v1
 ; SI-NEXT:    v_readfirstlane_b32 s84, v0
-; SI-NEXT:    v_writelane_b32 v30, s99, 35
 ; SI-NEXT:    s_lshr_b32 s90, s29, 16
 ; SI-NEXT:    s_lshr_b32 s92, s28, 16
 ; SI-NEXT:    s_lshr_b32 s94, s27, 16
@@ -39765,6 +39765,7 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a,
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_add_i32 s65, s4, 0x30000
 ; SI-NEXT:  .LBB51_3: ; %end
+; SI-NEXT:    v_readlane_b32 s30, v30, 34
 ; SI-NEXT:    v_mov_b32_e32 v0, s36
 ; SI-NEXT:    v_mov_b32_e32 v1, s37
 ; SI-NEXT:    v_mov_b32_e32 v2, s38
@@ -39795,42 +39796,41 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a,
 ; SI-NEXT:    v_mov_b32_e32 v27, s63
 ; SI-NEXT:    v_mov_b32_e32 v28, s64
 ; SI-NEXT:    v_mov_b32_e32 v29, s65
-; SI-NEXT:    v_readlane_b32 s99, v30, 35
-; SI-NEXT:    v_readlane_b32 s98, v30, 34
-; SI-NEXT:    v_readlane_b32 s97, v30, 33
-; SI-NEXT:    v_readlane_b32 s96, v30, 32
-; SI-NEXT:    v_readlane_b32 s87, v30, 31
-; SI-NEXT:    v_readlane_b32 s86, v30, 30
-; SI-NEXT:    v_readlane_b32 s85, v30, 29
-; SI-NEXT:    v_readlane_b32 s84, v30, 28
-; SI-NEXT:    v_readlane_b32 s83, v30, 27
-; SI-NEXT:    v_readlane_b32 s82, v30, 26
-; SI-NEXT:    v_readlane_b32 s81, v30, 25
-; SI-NEXT:    v_readlane_b32 s80, v30, 24
-; SI-NEXT:    v_readlane_b32 s71, v30, 23
-; SI-NEXT:    v_readlane_b32 s70, v30, 22
-; SI-NEXT:    v_readlane_b32 s69, v30, 21
-; SI-NEXT:    v_readlane_b32 s68, v30, 20
-; SI-NEXT:    v_readlane_b32 s67, v30, 19
-; SI-NEXT:    v_readlane_b32 s66, v30, 18
-; SI-NEXT:    v_readlane_b32 s65, v30, 17
-; SI-NEXT:    v_readlane_b32 s64, v30, 16
-; SI-NEXT:    v_readlane_b32 s55, v30, 15
-; SI-NEXT:    v_readlane_b32 s54, v30, 14
-; SI-NEXT:    v_readlane_b32 s53, v30, 13
-; SI-NEXT:    v_readlane_b32 s52, v30, 12
-; SI-NEXT:    v_readlane_b32 s51, v30, 11
-; SI-NEXT:    v_readlane_b32 s50, v30, 10
-; SI-NEXT:    v_readlane_b32 s49, v30, 9
-; SI-NEXT:    v_readlane_b32 s48, v30, 8
-; SI-NEXT:    v_readlane_b32 s39, v30, 7
-; SI-NEXT:    v_readlane_b32 s38, v30, 6
-; SI-NEXT:    v_readlane_b32 s37, v30, 5
-; SI-NEXT:    v_readlane_b32 s36, v30, 4
-; SI-NEXT:    v_readlane_b32 s35, v30, 3
-; SI-NEXT:    v_readlane_b32 s34, v30, 2
-; SI-NEXT:    v_readlane_b32 s31, v30, 1
-; SI-NEXT:    v_readlane_b32 s30, v30, 0
+; SI-NEXT:    v_readlane_b32 s31, v30, 35
+; SI-NEXT:    v_readlane_b32 s99, v30, 33
+; SI-NEXT:    v_readlane_b32 s98, v30, 32
+; SI-NEXT:    v_readlane_b32 s97, v30, 31
+; SI-NEXT:    v_readlane_b32 s96, v30, 30
+; SI-NEXT:    v_readlane_b32 s87, v30, 29
+; SI-NEXT:    v_readlane_b32 s86, v30, 28
+; SI-NEXT:    v_readlane_b32 s85, v30, 27
+; SI-NEXT:    v_readlane_b32 s84, v30, 26
+; SI-NEXT:    v_readlane_b32 s83, v30, 25
+; SI-NEXT:    v_readlane_b32 s82, v30, 24
+; SI-NEXT:    v_readlane_b32 s81, v30, 23
+; SI-NEXT:    v_readlane_b32 s80, v30, 22
+; SI-NEXT:    v_readlane_b32 s71, v30, 21
+; SI-NEXT:    v_readlane_b32 s70, v30, 20
+; SI-NEXT:    v_readlane_b32 s69, v30, 19
+; SI-NEXT:    v_readlane_b32 s68, v30, 18
+; SI-NEXT:    v_readlane_b32 s67, v30, 17
+; SI-NEXT:    v_readlane_b32 s66, v30, 16
+; SI-NEXT:    v_readlane_b32 s65, v30, 15
+; SI-NEXT:    v_readlane_b32 s64, v30, 14
+; SI-NEXT:    v_readlane_b32 s55, v30, 13
+; SI-NEXT:    v_readlane_b32 s54, v30, 12
+; SI-NEXT:    v_readlane_b32 s53, v30, 11
+; SI-NEXT:    v_readlane_b32 s52, v30, 10
+; SI-NEXT:    v_readlane_b32 s51, v30, 9
+; SI-NEXT:    v_readlane_b32 s50, v30, 8
+; SI-NEXT:    v_readlane_b32 s49, v30, 7
+; SI-NEXT:    v_readlane_b32 s48, v30, 6
+; SI-NEXT:    v_readlane_b32 s39, v30, 5
+; SI-NEXT:    v_readlane_b32 s38, v30, 4
+; SI-NEXT:    v_readlane_b32 s37, v30, 3
+; SI-NEXT:    v_readlane_b32 s36, v30, 2
+; SI-NEXT:    v_readlane_b32 s35, v30, 1
+; SI-NEXT:    v_readlane_b32 s34, v30, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -39847,47 +39847,48 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a,
 ; VI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v30, s30, 0
-; VI-NEXT:    v_writelane_b32 v30, s31, 1
-; VI-NEXT:    v_writelane_b32 v30, s34, 2
-; VI-NEXT:    v_writelane_b32 v30, s35, 3
-; VI-NEXT:    v_writelane_b32 v30, s36, 4
-; VI-NEXT:    v_writelane_b32 v30, s37, 5
-; VI-NEXT:    v_writelane_b32 v30, s38, 6
-; VI-NEXT:    v_writelane_b32 v30, s39, 7
-; VI-NEXT:    v_writelane_b32 v30, s48, 8
-; VI-NEXT:    v_writelane_b32 v30, s49, 9
-; VI-NEXT:    v_writelane_b32 v30, s50, 10
-; VI-NEXT:    v_writelane_b32 v30, s51, 11
-; VI-NEXT:    v_writelane_b32 v30, s52, 12
-; VI-NEXT:    v_writelane_b32 v30, s53, 13
-; VI-NEXT:    v_writelane_b32 v30, s54, 14
-; VI-NEXT:    v_writelane_b32 v30, s55, 15
-; VI-NEXT:    v_writelane_b32 v30, s64, 16
-; VI-NEXT:    v_writelane_b32 v30, s65, 17
-; VI-NEXT:    v_writelane_b32 v30, s66, 18
-; VI-NEXT:    v_writelane_b32 v30, s67, 19
-; VI-NEXT:    v_writelane_b32 v30, s68, 20
-; VI-NEXT:    v_writelane_b32 v30, s69, 21
-; VI-NEXT:    v_writelane_b32 v30, s70, 22
-; VI-NEXT:    v_writelane_b32 v30, s71, 23
-; VI-NEXT:    v_writelane_b32 v30, s80, 24
-; VI-NEXT:    v_writelane_b32 v30, s81, 25
-; VI-NEXT:    v_writelane_b32 v30, s82, 26
+; VI-NEXT:    v_writelane_b32 v30, s34, 0
+; VI-NEXT:    v_writelane_b32 v30, s35, 1
+; VI-NEXT:    v_writelane_b32 v30, s36, 2
+; VI-NEXT:    v_writelane_b32 v30, s37, 3
+; VI-NEXT:    v_writelane_b32 v30, s38, 4
+; VI-NEXT:    v_writelane_b32 v30, s39, 5
+; VI-NEXT:    v_writelane_b32 v30, s48, 6
+; VI-NEXT:    v_writelane_b32 v30, s49, 7
+; VI-NEXT:    v_writelane_b32 v30, s50, 8
+; VI-NEXT:    v_writelane_b32 v30, s51, 9
+; VI-NEXT:    v_writelane_b32 v30, s52, 10
+; VI-NEXT:    v_writelane_b32 v30, s53, 11
+; VI-NEXT:    v_writelane_b32 v30, s54, 12
+; VI-NEXT:    v_writelane_b32 v30, s55, 13
+; VI-NEXT:    v_writelane_b32 v30, s64, 14
+; VI-NEXT:    v_writelane_b32 v30, s65, 15
+; VI-NEXT:    v_writelane_b32 v30, s66, 16
+; VI-NEXT:    v_writelane_b32 v30, s67, 17
+; VI-NEXT:    v_writelane_b32 v30, s68, 18
+; VI-NEXT:    v_writelane_b32 v30, s69, 19
+; VI-NEXT:    v_writelane_b32 v30, s70, 20
+; VI-NEXT:    v_writelane_b32 v30, s71, 21
+; VI-NEXT:    v_writelane_b32 v30, s80, 22
+; VI-NEXT:    v_writelane_b32 v30, s81, 23
+; VI-NEXT:    v_writelane_b32 v30, s82, 24
+; VI-NEXT:    v_writelane_b32 v30, s83, 25
+; VI-NEXT:    v_writelane_b32 v30, s84, 26
+; VI-NEXT:    v_writelane_b32 v30, s85, 27
 ; VI-NEXT:    v_readfirstlane_b32 s11, v13
-; VI-NEXT:    v_writelane_b32 v30, s83, 27
+; VI-NEXT:    v_writelane_b32 v30, s86, 28
 ; VI-NEXT:    s_lshr_b32 s63, s11, 16
 ; VI-NEXT:    v_readfirstlane_b32 s13, v12
 ; VI-NEXT:    ; implicit-def: $vgpr31 : SGPR spill to VGPR lane
-; VI-NEXT:    v_writelane_b32 v30, s84, 28
+; VI-NEXT:    v_writelane_b32 v30, s87, 29
 ; VI-NEXT:    s_lshr_b32 s62, s13, 16
 ; VI-NEXT:    v_readfirstlane_b32 s15, v11
 ; VI-NEXT:    v_writelane_b32 v31, s63, 0
-; VI-NEXT:    v_writelane_b32 v30, s85, 29
+; VI-NEXT:    v_writelane_b32 v30, s30, 30
 ; VI-NEXT:    s_lshr_b32 s61, s15, 16
 ; VI-NEXT:    v_readfirstlane_b32 s73, v10
 ; VI-NEXT:    v_writelane_b32 v31, s62, 1
-; VI-NEXT:    v_writelane_b32 v30, s86, 30
+; VI-NEXT:    v_writelane_b32 v30, s31, 31
 ; VI-NEXT:    v_readfirstlane_b32 s7, v15
 ; VI-NEXT:    v_readfirstlane_b32 s9, v14
 ; VI-NEXT:    s_lshr_b32 s60, s73, 16
@@ -39902,7 +39903,6 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a,
 ; VI-NEXT:    v_readfirstlane_b32 s85, v1
 ; VI-NEXT:    v_readfirstlane_b32 s6, v0
 ; VI-NEXT:    v_writelane_b32 v31, s61, 2
-; VI-NEXT:    v_writelane_b32 v30, s87, 31
 ; VI-NEXT:    s_lshr_b32 s90, s29, 16
 ; VI-NEXT:    s_lshr_b32 s30, s28, 16
 ; VI-NEXT:    s_lshr_b32 s34, s27, 16
@@ -40185,6 +40185,7 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a,
 ; VI-NEXT:    s_or_b32 s4, s5, s4
 ; VI-NEXT:    s_add_i32 s65, s4, 0x30000
 ; VI-NEXT:  .LBB51_3: ; %end
+; VI-NEXT:    v_readlane_b32 s30, v30, 30
 ; VI-NEXT:    v_mov_b32_e32 v0, s36
 ; VI-NEXT:    v_mov_b32_e32 v1, s37
 ; VI-NEXT:    v_mov_b32_e32 v2, s38
@@ -40215,38 +40216,37 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a,
 ; VI-NEXT:    v_mov_b32_e32 v27, s63
 ; VI-NEXT:    v_mov_b32_e32 v28, s64
 ; VI-NEXT:    v_mov_b32_e32 v29, s65
-; VI-NEXT:    v_readlane_b32 s87, v30, 31
-; VI-NEXT:    v_readlane_b32 s86, v30, 30
-; VI-NEXT:    v_readlane_b32 s85, v30, 29
-; VI-NEXT:    v_readlane_b32 s84, v30, 28
-; VI-NEXT:    v_readlane_b32 s83, v30, 27
-; VI-NEXT:    v_readlane_b32 s82, v30, 26
-; VI-NEXT:    v_readlane_b32 s81, v30, 25
-; VI-NEXT:    v_readlane_b32 s80, v30, 24
-; VI-NEXT:    v_readlane_b32 s71, v30, 23
-; VI-NEXT:    v_readlane_b32 s70, v30, 22
-; VI-NEXT:    v_readlane_b32 s69, v30, 21
-; VI-NEXT:    v_readlane_b32 s68, v30, 20
-; VI-NEXT:    v_readlane_b32 s67, v30, 19
-; VI-NEXT:    v_readlane_b32 s66, v30, 18
-; VI-NEXT:    v_readlane_b32 s65, v30, 17
-; VI-NEXT:    v_readlane_b32 s64, v30, 16
-; VI-NEXT:    v_readlane_b32 s55, v30, 15
-; VI-NEXT:    v_readlane_b32 s54, v30, 14
-; VI-NEXT:    v_readlane_b32 s53, v30, 13
-; VI-NEXT:    v_readlane_b32 s52, v30, 12
-; VI-NEXT:    v_readlane_b32 s51, v30, 11
-; VI-NEXT:    v_readlane_b32 s50, v30, 10
-; VI-NEXT:    v_readlane_b32 s49, v30, 9
-; VI-NEXT:    v_readlane_b32 s48, v30, 8
-; VI-NEXT:    v_readlane_b32 s39, v30, 7
-; VI-NEXT:    v_readlane_b32 s38, v30, 6
-; VI-NEXT:    v_readlane_b32 s37, v30, 5
-; VI-NEXT:    v_readlane_b32 s36, v30, 4
-; VI-NEXT:    v_readlane_b32 s35, v30, 3
-; VI-NEXT:    v_readlane_b32 s34, v30, 2
-; VI-NEXT:    v_readlane_b32 s31, v30, 1
-; VI-NEXT:    v_readlane_b32 s30, v30, 0
+; VI-NEXT:    v_readlane_b32 s31, v30, 31
+; VI-NEXT:    v_readlane_b32 s87, v30, 29
+; VI-NEXT:    v_readlane_b32 s86, v30, 28
+; VI-NEXT:    v_readlane_b32 s85, v30, 27
+; VI-NEXT:    v_readlane_b32 s84, v30, 26
+; VI-NEXT:    v_readlane_b32 s83, v30, 25
+; VI-NEXT:    v_readlane_b32 s82, v30, 24
+; VI-NEXT:    v_readlane_b32 s81, v30, 23
+; VI-NEXT:    v_readlane_b32 s80, v30, 22
+; VI-NEXT:    v_readlane_b32 s71, v30, 21
+; VI-NEXT:    v_readlane_b32 s70, v30, 20
+; VI-NEXT:    v_readlane_b32 s69, v30, 19
+; VI-NEXT:    v_readlane_b32 s68, v30, 18
+; VI-NEXT:    v_readlane_b32 s67, v30, 17
+; VI-NEXT:    v_readlane_b32 s66, v30, 16
+; VI-NEXT:    v_readlane_b32 s65, v30, 15
+; VI-NEXT:    v_readlane_b32 s64, v30, 14
+; VI-NEXT:    v_readlane_b32 s55, v30, 13
+; VI-NEXT:    v_readlane_b32 s54, v30, 12
+; VI-NEXT:    v_readlane_b32 s53, v30, 11
+; VI-NEXT:    v_readlane_b32 s52, v30, 10
+; VI-NEXT:    v_readlane_b32 s51, v30, 9
+; VI-NEXT:    v_readlane_b32 s50, v30, 8
+; VI-NEXT:    v_readlane_b32 s49, v30, 7
+; VI-NEXT:    v_readlane_b32 s48, v30, 6
+; VI-NEXT:    v_readlane_b32 s39, v30, 5
+; VI-NEXT:    v_readlane_b32 s38, v30, 4
+; VI-NEXT:    v_readlane_b32 s37, v30, 3
+; VI-NEXT:    v_readlane_b32 s36, v30, 2
+; VI-NEXT:    v_readlane_b32 s35, v30, 1
+; VI-NEXT:    v_readlane_b32 s34, v30, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -40601,7 +40601,7 @@ end:
   ret <15 x double> %phi
 }
 
-define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) {
+define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v15f64_to_v60f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -41434,7 +41434,7 @@ end:
   ret <60 x half> %phi
 }
 
-define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a, i32 inreg %b) {
+define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v15f64_to_v60f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -41442,23 +41442,23 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a
 ; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v62, s30, 0
-; SI-NEXT:    v_writelane_b32 v62, s31, 1
-; SI-NEXT:    v_writelane_b32 v62, s34, 2
-; SI-NEXT:    v_writelane_b32 v62, s35, 3
-; SI-NEXT:    v_writelane_b32 v62, s36, 4
-; SI-NEXT:    v_writelane_b32 v62, s37, 5
-; SI-NEXT:    v_writelane_b32 v62, s38, 6
-; SI-NEXT:    v_writelane_b32 v62, s39, 7
-; SI-NEXT:    v_writelane_b32 v62, s48, 8
-; SI-NEXT:    v_writelane_b32 v62, s49, 9
-; SI-NEXT:    v_writelane_b32 v62, s50, 10
-; SI-NEXT:    v_writelane_b32 v62, s51, 11
-; SI-NEXT:    v_writelane_b32 v62, s52, 12
-; SI-NEXT:    v_writelane_b32 v62, s53, 13
-; SI-NEXT:    v_writelane_b32 v62, s54, 14
+; SI-NEXT:    v_writelane_b32 v62, s34, 0
+; SI-NEXT:    v_writelane_b32 v62, s35, 1
+; SI-NEXT:    v_writelane_b32 v62, s36, 2
+; SI-NEXT:    v_writelane_b32 v62, s37, 3
+; SI-NEXT:    v_writelane_b32 v62, s38, 4
+; SI-NEXT:    v_writelane_b32 v62, s39, 5
+; SI-NEXT:    v_writelane_b32 v62, s48, 6
+; SI-NEXT:    v_writelane_b32 v62, s49, 7
+; SI-NEXT:    v_writelane_b32 v62, s50, 8
+; SI-NEXT:    v_writelane_b32 v62, s51, 9
+; SI-NEXT:    v_writelane_b32 v62, s52, 10
+; SI-NEXT:    v_writelane_b32 v62, s53, 11
+; SI-NEXT:    v_writelane_b32 v62, s54, 12
+; SI-NEXT:    v_writelane_b32 v62, s55, 13
+; SI-NEXT:    v_writelane_b32 v62, s64, 14
 ; SI-NEXT:    v_readfirstlane_b32 s42, v16
-; SI-NEXT:    v_writelane_b32 v62, s55, 15
+; SI-NEXT:    v_writelane_b32 v62, s30, 15
 ; SI-NEXT:    v_readfirstlane_b32 s15, v15
 ; SI-NEXT:    v_readfirstlane_b32 s14, v14
 ; SI-NEXT:    v_readfirstlane_b32 s41, v13
@@ -41490,7 +41490,7 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a
 ; SI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill
-; SI-NEXT:    v_writelane_b32 v62, s64, 16
+; SI-NEXT:    v_writelane_b32 v62, s31, 16
 ; SI-NEXT:    s_cbranch_scc0 .LBB53_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s64, s15, 16
@@ -41773,6 +41773,7 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a
 ; SI-NEXT:    v_or_b32_e32 v28, v28, v30
 ; SI-NEXT:    v_and_b32_e32 v29, 0xffff, v29
 ; SI-NEXT:    v_lshlrev_b32_e32 v30, 16, v55
+; SI-NEXT:    v_readlane_b32 s30, v62, 15
 ; SI-NEXT:    v_or_b32_e32 v11, v11, v39
 ; SI-NEXT:    v_or_b32_e32 v13, v13, v38
 ; SI-NEXT:    v_or_b32_e32 v15, v15, v37
@@ -41783,23 +41784,22 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a
 ; SI-NEXT:    v_or_b32_e32 v25, v25, v32
 ; SI-NEXT:    v_or_b32_e32 v27, v27, v31
 ; SI-NEXT:    v_or_b32_e32 v29, v29, v30
-; SI-NEXT:    v_readlane_b32 s64, v62, 16
-; SI-NEXT:    v_readlane_b32 s55, v62, 15
-; SI-NEXT:    v_readlane_b32 s54, v62, 14
-; SI-NEXT:    v_readlane_b32 s53, v62, 13
-; SI-NEXT:    v_readlane_b32 s52, v62, 12
-; SI-NEXT:    v_readlane_b32 s51, v62, 11
-; SI-NEXT:    v_readlane_b32 s50, v62, 10
-; SI-NEXT:    v_readlane_b32 s49, v62, 9
-; SI-NEXT:    v_readlane_b32 s48, v62, 8
-; SI-NEXT:    v_readlane_b32 s39, v62, 7
-; SI-NEXT:    v_readlane_b32 s38, v62, 6
-; SI-NEXT:    v_readlane_b32 s37, v62, 5
-; SI-NEXT:    v_readlane_b32 s36, v62, 4
-; SI-NEXT:    v_readlane_b32 s35, v62, 3
-; SI-NEXT:    v_readlane_b32 s34, v62, 2
-; SI-NEXT:    v_readlane_b32 s31, v62, 1
-; SI-NEXT:    v_readlane_b32 s30, v62, 0
+; SI-NEXT:    v_readlane_b32 s31, v62, 16
+; SI-NEXT:    v_readlane_b32 s64, v62, 14
+; SI-NEXT:    v_readlane_b32 s55, v62, 13
+; SI-NEXT:    v_readlane_b32 s54, v62, 12
+; SI-NEXT:    v_readlane_b32 s53, v62, 11
+; SI-NEXT:    v_readlane_b32 s52, v62, 10
+; SI-NEXT:    v_readlane_b32 s51, v62, 9
+; SI-NEXT:    v_readlane_b32 s50, v62, 8
+; SI-NEXT:    v_readlane_b32 s49, v62, 7
+; SI-NEXT:    v_readlane_b32 s48, v62, 6
+; SI-NEXT:    v_readlane_b32 s39, v62, 5
+; SI-NEXT:    v_readlane_b32 s38, v62, 4
+; SI-NEXT:    v_readlane_b32 s37, v62, 3
+; SI-NEXT:    v_readlane_b32 s36, v62, 2
+; SI-NEXT:    v_readlane_b32 s35, v62, 1
+; SI-NEXT:    v_readlane_b32 s34, v62, 0
 ; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -41812,14 +41812,14 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a
 ; VI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v60, s30, 0
-; VI-NEXT:    v_writelane_b32 v60, s31, 1
-; VI-NEXT:    v_writelane_b32 v60, s34, 2
-; VI-NEXT:    v_writelane_b32 v60, s35, 3
-; VI-NEXT:    v_writelane_b32 v60, s36, 4
-; VI-NEXT:    v_writelane_b32 v60, s37, 5
+; VI-NEXT:    v_writelane_b32 v60, s34, 0
+; VI-NEXT:    v_writelane_b32 v60, s35, 1
+; VI-NEXT:    v_writelane_b32 v60, s36, 2
+; VI-NEXT:    v_writelane_b32 v60, s37, 3
+; VI-NEXT:    v_writelane_b32 v60, s38, 4
+; VI-NEXT:    v_writelane_b32 v60, s39, 5
 ; VI-NEXT:    v_readfirstlane_b32 s4, v16
-; VI-NEXT:    v_writelane_b32 v60, s38, 6
+; VI-NEXT:    v_writelane_b32 v60, s30, 6
 ; VI-NEXT:    v_readfirstlane_b32 s9, v15
 ; VI-NEXT:    v_readfirstlane_b32 s8, v14
 ; VI-NEXT:    v_readfirstlane_b32 s11, v13
@@ -41849,7 +41849,7 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a
 ; VI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill
-; VI-NEXT:    v_writelane_b32 v60, s39, 7
+; VI-NEXT:    v_writelane_b32 v60, s31, 7
 ; VI-NEXT:    s_cbranch_scc0 .LBB53_3
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    s_lshr_b32 s46, s9, 16
@@ -42086,6 +42086,7 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a
 ; VI-NEXT:    v_lshlrev_b32_e32 v31, 16, v35
 ; VI-NEXT:    v_or_b32_sdwa v28, v28, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_lshlrev_b32_e32 v30, 16, v33
+; VI-NEXT:    v_readlane_b32 s30, v60, 6
 ; VI-NEXT:    v_or_b32_sdwa v11, v11, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v13, v13, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v15, v15, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -42096,14 +42097,13 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a
 ; VI-NEXT:    v_or_b32_sdwa v25, v25, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v27, v27, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_readlane_b32 s39, v60, 7
-; VI-NEXT:    v_readlane_b32 s38, v60, 6
-; VI-NEXT:    v_readlane_b32 s37, v60, 5
-; VI-NEXT:    v_readlane_b32 s36, v60, 4
-; VI-NEXT:    v_readlane_b32 s35, v60, 3
-; VI-NEXT:    v_readlane_b32 s34, v60, 2
-; VI-NEXT:    v_readlane_b32 s31, v60, 1
-; VI-NEXT:    v_readlane_b32 s30, v60, 0
+; VI-NEXT:    v_readlane_b32 s31, v60, 7
+; VI-NEXT:    v_readlane_b32 s39, v60, 5
+; VI-NEXT:    v_readlane_b32 s38, v60, 4
+; VI-NEXT:    v_readlane_b32 s37, v60, 3
+; VI-NEXT:    v_readlane_b32 s36, v60, 2
+; VI-NEXT:    v_readlane_b32 s35, v60, 1
+; VI-NEXT:    v_readlane_b32 s34, v60, 0
 ; VI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -42116,10 +42116,10 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a
 ; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v60, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v60, s31, 1
+; GFX9-NEXT:    v_writelane_b32 v60, s34, 0
+; GFX9-NEXT:    v_writelane_b32 v60, s35, 1
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v16
-; GFX9-NEXT:    v_writelane_b32 v60, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v60, s30, 2
 ; GFX9-NEXT:    v_readfirstlane_b32 s9, v15
 ; GFX9-NEXT:    v_readfirstlane_b32 s8, v14
 ; GFX9-NEXT:    v_readfirstlane_b32 s11, v13
@@ -42149,7 +42149,7 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a
 ; GFX9-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_writelane_b32 v60, s35, 3
+; GFX9-NEXT:    v_writelane_b32 v60, s31, 3
 ; GFX9-NEXT:    s_cbranch_scc0 .LBB53_3
 ; GFX9-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX9-NEXT:    s_lshr_b32 s46, s9, 16
@@ -42378,6 +42378,7 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a
 ; GFX9-NEXT:    v_and_b32_e32 v27, 0xffff, v27
 ; GFX9-NEXT:    v_and_b32_e32 v28, 0xffff, v28
 ; GFX9-NEXT:    v_and_b32_e32 v29, 0xffff, v29
+; GFX9-NEXT:    v_readlane_b32 s30, v60, 2
 ; GFX9-NEXT:    v_lshl_or_b32 v10, v54, 16, v10
 ; GFX9-NEXT:    v_lshl_or_b32 v12, v52, 16, v12
 ; GFX9-NEXT:    v_lshl_or_b32 v14, v50, 16, v14
@@ -42396,10 +42397,9 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a
 ; GFX9-NEXT:    v_lshl_or_b32 v27, v35, 16, v27
 ; GFX9-NEXT:    v_lshl_or_b32 v28, v30, 16, v28
 ; GFX9-NEXT:    v_lshl_or_b32 v29, v33, 16, v29
-; GFX9-NEXT:    v_readlane_b32 s35, v60, 3
-; GFX9-NEXT:    v_readlane_b32 s34, v60, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v60, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v60, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v60, 3
+; GFX9-NEXT:    v_readlane_b32 s35, v60, 1
+; GFX9-NEXT:    v_readlane_b32 s34, v60, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -42844,7 +42844,7 @@ end:
   ret <60 x half> %phi
 }
 
-define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) {
+define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v60f16_to_v15f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -44333,7 +44333,7 @@ end:
   ret <15 x double> %phi
 }
 
-define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a, i32 inreg %b) {
+define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v60f16_to_v15f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -44341,41 +44341,42 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a
 ; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v32, s30, 0
-; SI-NEXT:    v_writelane_b32 v32, s31, 1
-; SI-NEXT:    v_writelane_b32 v32, s34, 2
-; SI-NEXT:    v_writelane_b32 v32, s35, 3
-; SI-NEXT:    v_writelane_b32 v32, s36, 4
-; SI-NEXT:    v_writelane_b32 v32, s37, 5
-; SI-NEXT:    v_writelane_b32 v32, s38, 6
-; SI-NEXT:    v_writelane_b32 v32, s39, 7
-; SI-NEXT:    v_writelane_b32 v32, s48, 8
-; SI-NEXT:    v_writelane_b32 v32, s49, 9
-; SI-NEXT:    v_writelane_b32 v32, s50, 10
-; SI-NEXT:    v_writelane_b32 v32, s51, 11
-; SI-NEXT:    v_writelane_b32 v32, s52, 12
-; SI-NEXT:    v_writelane_b32 v32, s53, 13
-; SI-NEXT:    v_writelane_b32 v32, s54, 14
-; SI-NEXT:    v_writelane_b32 v32, s55, 15
-; SI-NEXT:    v_writelane_b32 v32, s64, 16
-; SI-NEXT:    v_writelane_b32 v32, s65, 17
-; SI-NEXT:    v_writelane_b32 v32, s66, 18
-; SI-NEXT:    v_writelane_b32 v32, s67, 19
-; SI-NEXT:    v_writelane_b32 v32, s68, 20
-; SI-NEXT:    v_writelane_b32 v32, s69, 21
-; SI-NEXT:    v_writelane_b32 v32, s70, 22
-; SI-NEXT:    v_writelane_b32 v32, s71, 23
-; SI-NEXT:    v_writelane_b32 v32, s80, 24
-; SI-NEXT:    v_writelane_b32 v32, s81, 25
-; SI-NEXT:    v_writelane_b32 v32, s82, 26
-; SI-NEXT:    v_writelane_b32 v32, s83, 27
-; SI-NEXT:    v_writelane_b32 v32, s84, 28
-; SI-NEXT:    v_writelane_b32 v32, s85, 29
-; SI-NEXT:    v_writelane_b32 v32, s86, 30
-; SI-NEXT:    v_writelane_b32 v32, s87, 31
-; SI-NEXT:    v_writelane_b32 v32, s96, 32
-; SI-NEXT:    v_writelane_b32 v32, s97, 33
-; SI-NEXT:    v_writelane_b32 v32, s98, 34
+; SI-NEXT:    v_writelane_b32 v32, s34, 0
+; SI-NEXT:    v_writelane_b32 v32, s35, 1
+; SI-NEXT:    v_writelane_b32 v32, s36, 2
+; SI-NEXT:    v_writelane_b32 v32, s37, 3
+; SI-NEXT:    v_writelane_b32 v32, s38, 4
+; SI-NEXT:    v_writelane_b32 v32, s39, 5
+; SI-NEXT:    v_writelane_b32 v32, s48, 6
+; SI-NEXT:    v_writelane_b32 v32, s49, 7
+; SI-NEXT:    v_writelane_b32 v32, s50, 8
+; SI-NEXT:    v_writelane_b32 v32, s51, 9
+; SI-NEXT:    v_writelane_b32 v32, s52, 10
+; SI-NEXT:    v_writelane_b32 v32, s53, 11
+; SI-NEXT:    v_writelane_b32 v32, s54, 12
+; SI-NEXT:    v_writelane_b32 v32, s55, 13
+; SI-NEXT:    v_writelane_b32 v32, s64, 14
+; SI-NEXT:    v_writelane_b32 v32, s65, 15
+; SI-NEXT:    v_writelane_b32 v32, s66, 16
+; SI-NEXT:    v_writelane_b32 v32, s67, 17
+; SI-NEXT:    v_writelane_b32 v32, s68, 18
+; SI-NEXT:    v_writelane_b32 v32, s69, 19
+; SI-NEXT:    v_writelane_b32 v32, s70, 20
+; SI-NEXT:    v_writelane_b32 v32, s71, 21
+; SI-NEXT:    v_writelane_b32 v32, s80, 22
+; SI-NEXT:    v_writelane_b32 v32, s81, 23
+; SI-NEXT:    v_writelane_b32 v32, s82, 24
+; SI-NEXT:    v_writelane_b32 v32, s83, 25
+; SI-NEXT:    v_writelane_b32 v32, s84, 26
+; SI-NEXT:    v_writelane_b32 v32, s85, 27
+; SI-NEXT:    v_writelane_b32 v32, s86, 28
+; SI-NEXT:    v_writelane_b32 v32, s87, 29
+; SI-NEXT:    v_writelane_b32 v32, s96, 30
+; SI-NEXT:    v_writelane_b32 v32, s97, 31
+; SI-NEXT:    v_writelane_b32 v32, s98, 32
+; SI-NEXT:    v_writelane_b32 v32, s99, 33
+; SI-NEXT:    v_writelane_b32 v32, s30, 34
+; SI-NEXT:    v_writelane_b32 v32, s31, 35
 ; SI-NEXT:    v_readfirstlane_b32 s6, v15
 ; SI-NEXT:    v_readfirstlane_b32 s8, v14
 ; SI-NEXT:    v_readfirstlane_b32 s10, v13
@@ -44392,7 +44393,6 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a
 ; SI-NEXT:    v_readfirstlane_b32 s30, v2
 ; SI-NEXT:    v_readfirstlane_b32 s35, v1
 ; SI-NEXT:    v_readfirstlane_b32 s70, v0
-; SI-NEXT:    v_writelane_b32 v32, s99, 35
 ; SI-NEXT:    s_lshr_b32 s31, s29, 16
 ; SI-NEXT:    s_lshr_b32 s68, s28, 16
 ; SI-NEXT:    s_lshr_b32 s71, s27, 16
@@ -44797,42 +44797,42 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a
 ; SI-NEXT:    v_mov_b32_e32 v30, s66
 ; SI-NEXT:    v_mov_b32_e32 v31, s67
 ; SI-NEXT:  .LBB55_5: ; %end
-; SI-NEXT:    v_readlane_b32 s99, v32, 35
-; SI-NEXT:    v_readlane_b32 s98, v32, 34
-; SI-NEXT:    v_readlane_b32 s97, v32, 33
-; SI-NEXT:    v_readlane_b32 s96, v32, 32
-; SI-NEXT:    v_readlane_b32 s87, v32, 31
-; SI-NEXT:    v_readlane_b32 s86, v32, 30
-; SI-NEXT:    v_readlane_b32 s85, v32, 29
-; SI-NEXT:    v_readlane_b32 s84, v32, 28
-; SI-NEXT:    v_readlane_b32 s83, v32, 27
-; SI-NEXT:    v_readlane_b32 s82, v32, 26
-; SI-NEXT:    v_readlane_b32 s81, v32, 25
-; SI-NEXT:    v_readlane_b32 s80, v32, 24
-; SI-NEXT:    v_readlane_b32 s71, v32, 23
-; SI-NEXT:    v_readlane_b32 s70, v32, 22
-; SI-NEXT:    v_readlane_b32 s69, v32, 21
-; SI-NEXT:    v_readlane_b32 s68, v32, 20
-; SI-NEXT:    v_readlane_b32 s67, v32, 19
-; SI-NEXT:    v_readlane_b32 s66, v32, 18
-; SI-NEXT:    v_readlane_b32 s65, v32, 17
-; SI-NEXT:    v_readlane_b32 s64, v32, 16
-; SI-NEXT:    v_readlane_b32 s55, v32, 15
-; SI-NEXT:    v_readlane_b32 s54, v32, 14
-; SI-NEXT:    v_readlane_b32 s53, v32, 13
-; SI-NEXT:    v_readlane_b32 s52, v32, 12
-; SI-NEXT:    v_readlane_b32 s51, v32, 11
-; SI-NEXT:    v_readlane_b32 s50, v32, 10
-; SI-NEXT:    v_readlane_b32 s49, v32, 9
-; SI-NEXT:    v_readlane_b32 s48, v32, 8
-; SI-NEXT:    v_readlane_b32 s39, v32, 7
-; SI-NEXT:    v_readlane_b32 s38, v32, 6
-; SI-NEXT:    v_readlane_b32 s37, v32, 5
-; SI-NEXT:    v_readlane_b32 s36, v32, 4
-; SI-NEXT:    v_readlane_b32 s35, v32, 3
-; SI-NEXT:    v_readlane_b32 s34, v32, 2
-; SI-NEXT:    v_readlane_b32 s31, v32, 1
-; SI-NEXT:    v_readlane_b32 s30, v32, 0
+; SI-NEXT:    v_readlane_b32 s30, v32, 34
+; SI-NEXT:    v_readlane_b32 s31, v32, 35
+; SI-NEXT:    v_readlane_b32 s99, v32, 33
+; SI-NEXT:    v_readlane_b32 s98, v32, 32
+; SI-NEXT:    v_readlane_b32 s97, v32, 31
+; SI-NEXT:    v_readlane_b32 s96, v32, 30
+; SI-NEXT:    v_readlane_b32 s87, v32, 29
+; SI-NEXT:    v_readlane_b32 s86, v32, 28
+; SI-NEXT:    v_readlane_b32 s85, v32, 27
+; SI-NEXT:    v_readlane_b32 s84, v32, 26
+; SI-NEXT:    v_readlane_b32 s83, v32, 25
+; SI-NEXT:    v_readlane_b32 s82, v32, 24
+; SI-NEXT:    v_readlane_b32 s81, v32, 23
+; SI-NEXT:    v_readlane_b32 s80, v32, 22
+; SI-NEXT:    v_readlane_b32 s71, v32, 21
+; SI-NEXT:    v_readlane_b32 s70, v32, 20
+; SI-NEXT:    v_readlane_b32 s69, v32, 19
+; SI-NEXT:    v_readlane_b32 s68, v32, 18
+; SI-NEXT:    v_readlane_b32 s67, v32, 17
+; SI-NEXT:    v_readlane_b32 s66, v32, 16
+; SI-NEXT:    v_readlane_b32 s65, v32, 15
+; SI-NEXT:    v_readlane_b32 s64, v32, 14
+; SI-NEXT:    v_readlane_b32 s55, v32, 13
+; SI-NEXT:    v_readlane_b32 s54, v32, 12
+; SI-NEXT:    v_readlane_b32 s53, v32, 11
+; SI-NEXT:    v_readlane_b32 s52, v32, 10
+; SI-NEXT:    v_readlane_b32 s51, v32, 9
+; SI-NEXT:    v_readlane_b32 s50, v32, 8
+; SI-NEXT:    v_readlane_b32 s49, v32, 7
+; SI-NEXT:    v_readlane_b32 s48, v32, 6
+; SI-NEXT:    v_readlane_b32 s39, v32, 5
+; SI-NEXT:    v_readlane_b32 s38, v32, 4
+; SI-NEXT:    v_readlane_b32 s37, v32, 3
+; SI-NEXT:    v_readlane_b32 s36, v32, 2
+; SI-NEXT:    v_readlane_b32 s35, v32, 1
+; SI-NEXT:    v_readlane_b32 s34, v32, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -44846,53 +44846,54 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v32, s30, 0
-; VI-NEXT:    v_writelane_b32 v32, s31, 1
-; VI-NEXT:    v_writelane_b32 v32, s34, 2
-; VI-NEXT:    v_writelane_b32 v32, s35, 3
-; VI-NEXT:    v_writelane_b32 v32, s36, 4
-; VI-NEXT:    v_writelane_b32 v32, s37, 5
-; VI-NEXT:    v_writelane_b32 v32, s38, 6
-; VI-NEXT:    v_writelane_b32 v32, s39, 7
-; VI-NEXT:    v_writelane_b32 v32, s48, 8
-; VI-NEXT:    v_writelane_b32 v32, s49, 9
-; VI-NEXT:    v_writelane_b32 v32, s50, 10
-; VI-NEXT:    v_writelane_b32 v32, s51, 11
-; VI-NEXT:    v_writelane_b32 v32, s52, 12
-; VI-NEXT:    v_writelane_b32 v32, s53, 13
-; VI-NEXT:    v_writelane_b32 v32, s54, 14
-; VI-NEXT:    v_writelane_b32 v32, s55, 15
-; VI-NEXT:    v_writelane_b32 v32, s64, 16
-; VI-NEXT:    v_writelane_b32 v32, s65, 17
-; VI-NEXT:    v_writelane_b32 v32, s66, 18
-; VI-NEXT:    v_writelane_b32 v32, s67, 19
-; VI-NEXT:    v_writelane_b32 v32, s68, 20
-; VI-NEXT:    v_writelane_b32 v32, s69, 21
-; VI-NEXT:    v_writelane_b32 v32, s70, 22
-; VI-NEXT:    v_writelane_b32 v32, s71, 23
-; VI-NEXT:    v_writelane_b32 v32, s80, 24
+; VI-NEXT:    v_writelane_b32 v32, s34, 0
+; VI-NEXT:    v_writelane_b32 v32, s35, 1
+; VI-NEXT:    v_writelane_b32 v32, s36, 2
+; VI-NEXT:    v_writelane_b32 v32, s37, 3
+; VI-NEXT:    v_writelane_b32 v32, s38, 4
+; VI-NEXT:    v_writelane_b32 v32, s39, 5
+; VI-NEXT:    v_writelane_b32 v32, s48, 6
+; VI-NEXT:    v_writelane_b32 v32, s49, 7
+; VI-NEXT:    v_writelane_b32 v32, s50, 8
+; VI-NEXT:    v_writelane_b32 v32, s51, 9
+; VI-NEXT:    v_writelane_b32 v32, s52, 10
+; VI-NEXT:    v_writelane_b32 v32, s53, 11
+; VI-NEXT:    v_writelane_b32 v32, s54, 12
+; VI-NEXT:    v_writelane_b32 v32, s55, 13
+; VI-NEXT:    v_writelane_b32 v32, s64, 14
+; VI-NEXT:    v_writelane_b32 v32, s65, 15
+; VI-NEXT:    v_writelane_b32 v32, s66, 16
+; VI-NEXT:    v_writelane_b32 v32, s67, 17
+; VI-NEXT:    v_writelane_b32 v32, s68, 18
+; VI-NEXT:    v_writelane_b32 v32, s69, 19
+; VI-NEXT:    v_writelane_b32 v32, s70, 20
+; VI-NEXT:    v_writelane_b32 v32, s71, 21
+; VI-NEXT:    v_writelane_b32 v32, s80, 22
+; VI-NEXT:    v_writelane_b32 v32, s81, 23
+; VI-NEXT:    v_writelane_b32 v32, s82, 24
+; VI-NEXT:    v_writelane_b32 v32, s83, 25
 ; VI-NEXT:    v_readfirstlane_b32 s6, v15
-; VI-NEXT:    v_writelane_b32 v32, s81, 25
+; VI-NEXT:    v_writelane_b32 v32, s84, 26
 ; VI-NEXT:    s_lshr_b32 vcc_lo, s6, 16
 ; VI-NEXT:    v_readfirstlane_b32 s8, v14
 ; VI-NEXT:    ; implicit-def: $vgpr33 : SGPR spill to VGPR lane
-; VI-NEXT:    v_writelane_b32 v32, s82, 26
+; VI-NEXT:    v_writelane_b32 v32, s85, 27
 ; VI-NEXT:    s_lshr_b32 vcc_hi, s8, 16
 ; VI-NEXT:    v_readfirstlane_b32 s10, v13
 ; VI-NEXT:    v_writelane_b32 v33, vcc_lo, 0
-; VI-NEXT:    v_writelane_b32 v32, s83, 27
+; VI-NEXT:    v_writelane_b32 v32, s86, 28
 ; VI-NEXT:    s_lshr_b32 s63, s10, 16
 ; VI-NEXT:    v_readfirstlane_b32 s12, v12
 ; VI-NEXT:    v_writelane_b32 v33, vcc_hi, 1
-; VI-NEXT:    v_writelane_b32 v32, s84, 28
+; VI-NEXT:    v_writelane_b32 v32, s87, 29
 ; VI-NEXT:    s_lshr_b32 s62, s12, 16
 ; VI-NEXT:    v_readfirstlane_b32 s14, v11
 ; VI-NEXT:    v_writelane_b32 v33, s63, 2
-; VI-NEXT:    v_writelane_b32 v32, s85, 29
+; VI-NEXT:    v_writelane_b32 v32, s30, 30
 ; VI-NEXT:    s_lshr_b32 s61, s14, 16
 ; VI-NEXT:    v_readfirstlane_b32 s72, v10
 ; VI-NEXT:    v_writelane_b32 v33, s62, 3
-; VI-NEXT:    v_writelane_b32 v32, s86, 30
+; VI-NEXT:    v_writelane_b32 v32, s31, 31
 ; VI-NEXT:    s_lshr_b32 s60, s72, 16
 ; VI-NEXT:    v_readfirstlane_b32 s74, v9
 ; VI-NEXT:    v_readfirstlane_b32 s76, v8
@@ -44905,7 +44906,6 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a
 ; VI-NEXT:    v_readfirstlane_b32 s85, v1
 ; VI-NEXT:    v_readfirstlane_b32 s7, v0
 ; VI-NEXT:    v_writelane_b32 v33, s61, 4
-; VI-NEXT:    v_writelane_b32 v32, s87, 31
 ; VI-NEXT:    s_lshr_b32 s56, s29, 16
 ; VI-NEXT:    s_lshr_b32 s88, s28, 16
 ; VI-NEXT:    s_lshr_b32 s31, s27, 16
@@ -45204,38 +45204,38 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a
 ; VI-NEXT:    v_mov_b32_e32 v30, s66
 ; VI-NEXT:    v_mov_b32_e32 v31, s67
 ; VI-NEXT:  .LBB55_5: ; %end
-; VI-NEXT:    v_readlane_b32 s87, v32, 31
-; VI-NEXT:    v_readlane_b32 s86, v32, 30
-; VI-NEXT:    v_readlane_b32 s85, v32, 29
-; VI-NEXT:    v_readlane_b32 s84, v32, 28
-; VI-NEXT:    v_readlane_b32 s83, v32, 27
-; VI-NEXT:    v_readlane_b32 s82, v32, 26
-; VI-NEXT:    v_readlane_b32 s81, v32, 25
-; VI-NEXT:    v_readlane_b32 s80, v32, 24
-; VI-NEXT:    v_readlane_b32 s71, v32, 23
-; VI-NEXT:    v_readlane_b32 s70, v32, 22
-; VI-NEXT:    v_readlane_b32 s69, v32, 21
-; VI-NEXT:    v_readlane_b32 s68, v32, 20
-; VI-NEXT:    v_readlane_b32 s67, v32, 19
-; VI-NEXT:    v_readlane_b32 s66, v32, 18
-; VI-NEXT:    v_readlane_b32 s65, v32, 17
-; VI-NEXT:    v_readlane_b32 s64, v32, 16
-; VI-NEXT:    v_readlane_b32 s55, v32, 15
-; VI-NEXT:    v_readlane_b32 s54, v32, 14
-; VI-NEXT:    v_readlane_b32 s53, v32, 13
-; VI-NEXT:    v_readlane_b32 s52, v32, 12
-; VI-NEXT:    v_readlane_b32 s51, v32, 11
-; VI-NEXT:    v_readlane_b32 s50, v32, 10
-; VI-NEXT:    v_readlane_b32 s49, v32, 9
-; VI-NEXT:    v_readlane_b32 s48, v32, 8
-; VI-NEXT:    v_readlane_b32 s39, v32, 7
-; VI-NEXT:    v_readlane_b32 s38, v32, 6
-; VI-NEXT:    v_readlane_b32 s37, v32, 5
-; VI-NEXT:    v_readlane_b32 s36, v32, 4
-; VI-NEXT:    v_readlane_b32 s35, v32, 3
-; VI-NEXT:    v_readlane_b32 s34, v32, 2
-; VI-NEXT:    v_readlane_b32 s31, v32, 1
-; VI-NEXT:    v_readlane_b32 s30, v32, 0
+; VI-NEXT:    v_readlane_b32 s30, v32, 30
+; VI-NEXT:    v_readlane_b32 s31, v32, 31
+; VI-NEXT:    v_readlane_b32 s87, v32, 29
+; VI-NEXT:    v_readlane_b32 s86, v32, 28
+; VI-NEXT:    v_readlane_b32 s85, v32, 27
+; VI-NEXT:    v_readlane_b32 s84, v32, 26
+; VI-NEXT:    v_readlane_b32 s83, v32, 25
+; VI-NEXT:    v_readlane_b32 s82, v32, 24
+; VI-NEXT:    v_readlane_b32 s81, v32, 23
+; VI-NEXT:    v_readlane_b32 s80, v32, 22
+; VI-NEXT:    v_readlane_b32 s71, v32, 21
+; VI-NEXT:    v_readlane_b32 s70, v32, 20
+; VI-NEXT:    v_readlane_b32 s69, v32, 19
+; VI-NEXT:    v_readlane_b32 s68, v32, 18
+; VI-NEXT:    v_readlane_b32 s67, v32, 17
+; VI-NEXT:    v_readlane_b32 s66, v32, 16
+; VI-NEXT:    v_readlane_b32 s65, v32, 15
+; VI-NEXT:    v_readlane_b32 s64, v32, 14
+; VI-NEXT:    v_readlane_b32 s55, v32, 13
+; VI-NEXT:    v_readlane_b32 s54, v32, 12
+; VI-NEXT:    v_readlane_b32 s53, v32, 11
+; VI-NEXT:    v_readlane_b32 s52, v32, 10
+; VI-NEXT:    v_readlane_b32 s51, v32, 9
+; VI-NEXT:    v_readlane_b32 s50, v32, 8
+; VI-NEXT:    v_readlane_b32 s49, v32, 7
+; VI-NEXT:    v_readlane_b32 s48, v32, 6
+; VI-NEXT:    v_readlane_b32 s39, v32, 5
+; VI-NEXT:    v_readlane_b32 s38, v32, 4
+; VI-NEXT:    v_readlane_b32 s37, v32, 3
+; VI-NEXT:    v_readlane_b32 s36, v32, 2
+; VI-NEXT:    v_readlane_b32 s35, v32, 1
+; VI-NEXT:    v_readlane_b32 s34, v32, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -45584,7 +45584,7 @@ end:
   ret <15 x double> %phi
 }
 
-define <60 x half> @bitcast_v60i16_to_v60f16(<60 x i16> %a, i32 %b) {
+define <60 x half> @bitcast_v60i16_to_v60f16(<60 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v60i16_to_v60f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -47057,7 +47057,7 @@ end:
   ret <60 x half> %phi
 }
 
-define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i32 inreg %b) {
+define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v60i16_to_v60f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -47066,54 +47066,53 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_writelane_b32 v30, s30, 0
-; SI-NEXT:    v_writelane_b32 v30, s31, 1
-; SI-NEXT:    v_writelane_b32 v30, s34, 2
-; SI-NEXT:    v_writelane_b32 v30, s35, 3
-; SI-NEXT:    v_writelane_b32 v30, s36, 4
-; SI-NEXT:    v_writelane_b32 v30, s37, 5
-; SI-NEXT:    v_writelane_b32 v30, s38, 6
-; SI-NEXT:    v_writelane_b32 v30, s39, 7
-; SI-NEXT:    v_writelane_b32 v30, s48, 8
-; SI-NEXT:    v_writelane_b32 v30, s49, 9
-; SI-NEXT:    v_writelane_b32 v30, s50, 10
-; SI-NEXT:    v_writelane_b32 v30, s51, 11
-; SI-NEXT:    v_writelane_b32 v30, s52, 12
-; SI-NEXT:    v_writelane_b32 v30, s53, 13
-; SI-NEXT:    v_writelane_b32 v30, s54, 14
-; SI-NEXT:    v_writelane_b32 v30, s55, 15
-; SI-NEXT:    v_writelane_b32 v30, s64, 16
-; SI-NEXT:    v_writelane_b32 v30, s65, 17
-; SI-NEXT:    v_writelane_b32 v30, s66, 18
-; SI-NEXT:    v_writelane_b32 v30, s67, 19
-; SI-NEXT:    v_writelane_b32 v30, s68, 20
-; SI-NEXT:    v_writelane_b32 v30, s69, 21
-; SI-NEXT:    v_writelane_b32 v30, s70, 22
-; SI-NEXT:    v_writelane_b32 v30, s71, 23
-; SI-NEXT:    v_writelane_b32 v30, s80, 24
-; SI-NEXT:    v_writelane_b32 v30, s81, 25
-; SI-NEXT:    v_writelane_b32 v30, s82, 26
-; SI-NEXT:    v_writelane_b32 v30, s83, 27
-; SI-NEXT:    v_writelane_b32 v30, s84, 28
+; SI-NEXT:    v_writelane_b32 v30, s34, 0
+; SI-NEXT:    v_writelane_b32 v30, s35, 1
+; SI-NEXT:    v_writelane_b32 v30, s36, 2
+; SI-NEXT:    v_writelane_b32 v30, s37, 3
+; SI-NEXT:    v_writelane_b32 v30, s38, 4
+; SI-NEXT:    v_writelane_b32 v30, s39, 5
+; SI-NEXT:    v_writelane_b32 v30, s48, 6
+; SI-NEXT:    v_writelane_b32 v30, s49, 7
+; SI-NEXT:    v_writelane_b32 v30, s50, 8
+; SI-NEXT:    v_writelane_b32 v30, s51, 9
+; SI-NEXT:    v_writelane_b32 v30, s52, 10
+; SI-NEXT:    v_writelane_b32 v30, s53, 11
+; SI-NEXT:    v_writelane_b32 v30, s54, 12
+; SI-NEXT:    v_writelane_b32 v30, s55, 13
+; SI-NEXT:    v_writelane_b32 v30, s64, 14
+; SI-NEXT:    v_writelane_b32 v30, s65, 15
+; SI-NEXT:    v_writelane_b32 v30, s66, 16
+; SI-NEXT:    v_writelane_b32 v30, s67, 17
+; SI-NEXT:    v_writelane_b32 v30, s68, 18
+; SI-NEXT:    v_writelane_b32 v30, s69, 19
+; SI-NEXT:    v_writelane_b32 v30, s70, 20
+; SI-NEXT:    v_writelane_b32 v30, s71, 21
+; SI-NEXT:    v_writelane_b32 v30, s80, 22
+; SI-NEXT:    v_writelane_b32 v30, s81, 23
+; SI-NEXT:    v_writelane_b32 v30, s82, 24
+; SI-NEXT:    v_writelane_b32 v30, s83, 25
+; SI-NEXT:    v_writelane_b32 v30, s84, 26
+; SI-NEXT:    v_writelane_b32 v30, s85, 27
 ; SI-NEXT:    s_lshr_b32 s4, s19, 16
 ; SI-NEXT:    ; implicit-def: $vgpr31 : SGPR spill to VGPR lane
-; SI-NEXT:    v_writelane_b32 v30, s85, 29
+; SI-NEXT:    v_writelane_b32 v30, s86, 28
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_writelane_b32 v31, s4, 0
 ; SI-NEXT:    s_lshr_b32 s4, s18, 16
-; SI-NEXT:    v_writelane_b32 v30, s86, 30
+; SI-NEXT:    v_writelane_b32 v30, s87, 29
 ; SI-NEXT:    v_writelane_b32 v31, s4, 1
-; SI-NEXT:    v_writelane_b32 v30, s87, 31
+; SI-NEXT:    v_writelane_b32 v30, s96, 30
 ; SI-NEXT:    v_writelane_b32 v31, s17, 2
 ; SI-NEXT:    s_lshr_b32 s4, s17, 16
-; SI-NEXT:    v_writelane_b32 v30, s96, 32
+; SI-NEXT:    v_writelane_b32 v30, s97, 31
 ; SI-NEXT:    v_writelane_b32 v31, s4, 3
-; SI-NEXT:    v_writelane_b32 v30, s97, 33
+; SI-NEXT:    v_writelane_b32 v30, s98, 32
 ; SI-NEXT:    v_writelane_b32 v31, s16, 4
 ; SI-NEXT:    s_lshr_b32 s4, s16, 16
-; SI-NEXT:    v_writelane_b32 v30, s98, 34
+; SI-NEXT:    v_writelane_b32 v30, s99, 33
 ; SI-NEXT:    v_writelane_b32 v31, s4, 5
-; SI-NEXT:    v_writelane_b32 v30, s99, 35
+; SI-NEXT:    v_writelane_b32 v30, s30, 34
 ; SI-NEXT:    v_readfirstlane_b32 s75, v15
 ; SI-NEXT:    v_readfirstlane_b32 s70, v14
 ; SI-NEXT:    v_readfirstlane_b32 s17, v13
@@ -47131,6 +47130,7 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
 ; SI-NEXT:    v_readfirstlane_b32 s97, v1
 ; SI-NEXT:    v_readfirstlane_b32 s99, v0
 ; SI-NEXT:    v_writelane_b32 v31, s18, 6
+; SI-NEXT:    v_writelane_b32 v30, s31, 35
 ; SI-NEXT:    s_mov_b32 s85, s21
 ; SI-NEXT:    s_lshr_b32 s79, s29, 16
 ; SI-NEXT:    s_lshr_b32 s92, s28, 16
@@ -47692,6 +47692,7 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
 ; SI-NEXT:    s_and_b32 s5, s5, 0xffff
 ; SI-NEXT:    s_lshl_b32 s44, s86, 16
 ; SI-NEXT:    s_or_b32 s5, s5, s44
+; SI-NEXT:    v_readlane_b32 s30, v30, 34
 ; SI-NEXT:    v_readlane_b32 s47, v31, 15
 ; SI-NEXT:    v_readlane_b32 s45, v31, 25
 ; SI-NEXT:    v_mov_b32_e32 v0, s16
@@ -47724,42 +47725,41 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v27, s7
 ; SI-NEXT:    v_mov_b32_e32 v28, s4
 ; SI-NEXT:    v_mov_b32_e32 v29, s5
-; SI-NEXT:    v_readlane_b32 s99, v30, 35
-; SI-NEXT:    v_readlane_b32 s98, v30, 34
-; SI-NEXT:    v_readlane_b32 s97, v30, 33
-; SI-NEXT:    v_readlane_b32 s96, v30, 32
-; SI-NEXT:    v_readlane_b32 s87, v30, 31
-; SI-NEXT:    v_readlane_b32 s86, v30, 30
-; SI-NEXT:    v_readlane_b32 s85, v30, 29
-; SI-NEXT:    v_readlane_b32 s84, v30, 28
-; SI-NEXT:    v_readlane_b32 s83, v30, 27
-; SI-NEXT:    v_readlane_b32 s82, v30, 26
-; SI-NEXT:    v_readlane_b32 s81, v30, 25
-; SI-NEXT:    v_readlane_b32 s80, v30, 24
-; SI-NEXT:    v_readlane_b32 s71, v30, 23
-; SI-NEXT:    v_readlane_b32 s70, v30, 22
-; SI-NEXT:    v_readlane_b32 s69, v30, 21
-; SI-NEXT:    v_readlane_b32 s68, v30, 20
-; SI-NEXT:    v_readlane_b32 s67, v30, 19
-; SI-NEXT:    v_readlane_b32 s66, v30, 18
-; SI-NEXT:    v_readlane_b32 s65, v30, 17
-; SI-NEXT:    v_readlane_b32 s64, v30, 16
-; SI-NEXT:    v_readlane_b32 s55, v30, 15
-; SI-NEXT:    v_readlane_b32 s54, v30, 14
-; SI-NEXT:    v_readlane_b32 s53, v30, 13
-; SI-NEXT:    v_readlane_b32 s52, v30, 12
-; SI-NEXT:    v_readlane_b32 s51, v30, 11
-; SI-NEXT:    v_readlane_b32 s50, v30, 10
-; SI-NEXT:    v_readlane_b32 s49, v30, 9
-; SI-NEXT:    v_readlane_b32 s48, v30, 8
-; SI-NEXT:    v_readlane_b32 s39, v30, 7
-; SI-NEXT:    v_readlane_b32 s38, v30, 6
-; SI-NEXT:    v_readlane_b32 s37, v30, 5
-; SI-NEXT:    v_readlane_b32 s36, v30, 4
-; SI-NEXT:    v_readlane_b32 s35, v30, 3
-; SI-NEXT:    v_readlane_b32 s34, v30, 2
-; SI-NEXT:    v_readlane_b32 s31, v30, 1
-; SI-NEXT:    v_readlane_b32 s30, v30, 0
+; SI-NEXT:    v_readlane_b32 s31, v30, 35
+; SI-NEXT:    v_readlane_b32 s99, v30, 33
+; SI-NEXT:    v_readlane_b32 s98, v30, 32
+; SI-NEXT:    v_readlane_b32 s97, v30, 31
+; SI-NEXT:    v_readlane_b32 s96, v30, 30
+; SI-NEXT:    v_readlane_b32 s87, v30, 29
+; SI-NEXT:    v_readlane_b32 s86, v30, 28
+; SI-NEXT:    v_readlane_b32 s85, v30, 27
+; SI-NEXT:    v_readlane_b32 s84, v30, 26
+; SI-NEXT:    v_readlane_b32 s83, v30, 25
+; SI-NEXT:    v_readlane_b32 s82, v30, 24
+; SI-NEXT:    v_readlane_b32 s81, v30, 23
+; SI-NEXT:    v_readlane_b32 s80, v30, 22
+; SI-NEXT:    v_readlane_b32 s71, v30, 21
+; SI-NEXT:    v_readlane_b32 s70, v30, 20
+; SI-NEXT:    v_readlane_b32 s69, v30, 19
+; SI-NEXT:    v_readlane_b32 s68, v30, 18
+; SI-NEXT:    v_readlane_b32 s67, v30, 17
+; SI-NEXT:    v_readlane_b32 s66, v30, 16
+; SI-NEXT:    v_readlane_b32 s65, v30, 15
+; SI-NEXT:    v_readlane_b32 s64, v30, 14
+; SI-NEXT:    v_readlane_b32 s55, v30, 13
+; SI-NEXT:    v_readlane_b32 s54, v30, 12
+; SI-NEXT:    v_readlane_b32 s53, v30, 11
+; SI-NEXT:    v_readlane_b32 s52, v30, 10
+; SI-NEXT:    v_readlane_b32 s51, v30, 9
+; SI-NEXT:    v_readlane_b32 s50, v30, 8
+; SI-NEXT:    v_readlane_b32 s49, v30, 7
+; SI-NEXT:    v_readlane_b32 s48, v30, 6
+; SI-NEXT:    v_readlane_b32 s39, v30, 5
+; SI-NEXT:    v_readlane_b32 s38, v30, 4
+; SI-NEXT:    v_readlane_b32 s37, v30, 3
+; SI-NEXT:    v_readlane_b32 s36, v30, 2
+; SI-NEXT:    v_readlane_b32 s35, v30, 1
+; SI-NEXT:    v_readlane_b32 s34, v30, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -47821,13 +47821,14 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v30, s30, 0
-; VI-NEXT:    v_writelane_b32 v30, s31, 1
-; VI-NEXT:    v_writelane_b32 v30, s34, 2
-; VI-NEXT:    v_writelane_b32 v30, s35, 3
-; VI-NEXT:    v_writelane_b32 v30, s36, 4
-; VI-NEXT:    v_writelane_b32 v30, s37, 5
-; VI-NEXT:    v_writelane_b32 v30, s38, 6
+; VI-NEXT:    v_writelane_b32 v30, s34, 0
+; VI-NEXT:    v_writelane_b32 v30, s35, 1
+; VI-NEXT:    v_writelane_b32 v30, s36, 2
+; VI-NEXT:    v_writelane_b32 v30, s37, 3
+; VI-NEXT:    v_writelane_b32 v30, s38, 4
+; VI-NEXT:    v_writelane_b32 v30, s39, 5
+; VI-NEXT:    v_writelane_b32 v30, s30, 6
+; VI-NEXT:    v_writelane_b32 v30, s31, 7
 ; VI-NEXT:    v_readfirstlane_b32 s7, v15
 ; VI-NEXT:    v_readfirstlane_b32 s8, v14
 ; VI-NEXT:    v_readfirstlane_b32 s10, v13
@@ -47844,7 +47845,6 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
 ; VI-NEXT:    v_readfirstlane_b32 s89, v2
 ; VI-NEXT:    v_readfirstlane_b32 s30, v1
 ; VI-NEXT:    v_readfirstlane_b32 s35, v0
-; VI-NEXT:    v_writelane_b32 v30, s39, 7
 ; VI-NEXT:    s_lshr_b32 s44, s29, 16
 ; VI-NEXT:    s_lshr_b32 s47, s28, 16
 ; VI-NEXT:    s_lshr_b32 s58, s27, 16
@@ -48032,6 +48032,7 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
 ; VI-NEXT:    s_or_b32 s10, s10, s11
 ; VI-NEXT:    s_or_b32 s8, s8, s9
 ; VI-NEXT:    s_or_b32 s6, s7, s6
+; VI-NEXT:    v_readlane_b32 s30, v30, 6
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_mov_b32_e32 v2, s16
@@ -48062,14 +48063,13 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
 ; VI-NEXT:    v_mov_b32_e32 v27, s10
 ; VI-NEXT:    v_mov_b32_e32 v28, s8
 ; VI-NEXT:    v_mov_b32_e32 v29, s6
-; VI-NEXT:    v_readlane_b32 s39, v30, 7
-; VI-NEXT:    v_readlane_b32 s38, v30, 6
-; VI-NEXT:    v_readlane_b32 s37, v30, 5
-; VI-NEXT:    v_readlane_b32 s36, v30, 4
-; VI-NEXT:    v_readlane_b32 s35, v30, 3
-; VI-NEXT:    v_readlane_b32 s34, v30, 2
-; VI-NEXT:    v_readlane_b32 s31, v30, 1
-; VI-NEXT:    v_readlane_b32 s30, v30, 0
+; VI-NEXT:    v_readlane_b32 s31, v30, 7
+; VI-NEXT:    v_readlane_b32 s39, v30, 5
+; VI-NEXT:    v_readlane_b32 s38, v30, 4
+; VI-NEXT:    v_readlane_b32 s37, v30, 3
+; VI-NEXT:    v_readlane_b32 s36, v30, 2
+; VI-NEXT:    v_readlane_b32 s35, v30, 1
+; VI-NEXT:    v_readlane_b32 s34, v30, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -48084,10 +48084,10 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
 ; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v60, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v60, s31, 1
-; GFX9-NEXT:    v_writelane_b32 v60, s34, 2
-; GFX9-NEXT:    v_writelane_b32 v60, s35, 3
+; GFX9-NEXT:    v_writelane_b32 v60, s34, 0
+; GFX9-NEXT:    v_writelane_b32 v60, s35, 1
+; GFX9-NEXT:    v_writelane_b32 v60, s30, 2
+; GFX9-NEXT:    v_writelane_b32 v60, s31, 3
 ; GFX9-NEXT:    v_readfirstlane_b32 s35, v15
 ; GFX9-NEXT:    v_readfirstlane_b32 s34, v14
 ; GFX9-NEXT:    v_readfirstlane_b32 s31, v13
@@ -48361,6 +48361,7 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
 ; GFX9-NEXT:    v_and_b32_e32 v27, 0xffff, v27
 ; GFX9-NEXT:    v_and_b32_e32 v28, 0xffff, v28
 ; GFX9-NEXT:    v_and_b32_e32 v29, 0xffff, v29
+; GFX9-NEXT:    v_readlane_b32 s30, v60, 2
 ; GFX9-NEXT:    v_lshl_or_b32 v12, v55, 16, v12
 ; GFX9-NEXT:    v_lshl_or_b32 v13, v54, 16, v13
 ; GFX9-NEXT:    v_lshl_or_b32 v14, v53, 16, v14
@@ -48379,10 +48380,9 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
 ; GFX9-NEXT:    v_lshl_or_b32 v27, v32, 16, v27
 ; GFX9-NEXT:    v_lshl_or_b32 v28, v31, 16, v28
 ; GFX9-NEXT:    v_lshl_or_b32 v29, v30, 16, v29
-; GFX9-NEXT:    v_readlane_b32 s35, v60, 3
-; GFX9-NEXT:    v_readlane_b32 s34, v60, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v60, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v60, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v60, 3
+; GFX9-NEXT:    v_readlane_b32 s35, v60, 1
+; GFX9-NEXT:    v_readlane_b32 s34, v60, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -48857,7 +48857,7 @@ end:
   ret <60 x half> %phi
 }
 
-define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) {
+define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v60f16_to_v60i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -49884,7 +49884,7 @@ end:
   ret <60 x i16> %phi
 }
 
-define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i32 inreg %b) {
+define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v60f16_to_v60i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -49892,9 +49892,10 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v63, s30, 0
-; SI-NEXT:    v_writelane_b32 v63, s31, 1
-; SI-NEXT:    v_writelane_b32 v63, s34, 2
+; SI-NEXT:    v_writelane_b32 v63, s34, 0
+; SI-NEXT:    v_writelane_b32 v63, s35, 1
+; SI-NEXT:    v_writelane_b32 v63, s30, 2
+; SI-NEXT:    v_writelane_b32 v63, s31, 3
 ; SI-NEXT:    v_readfirstlane_b32 s6, v15
 ; SI-NEXT:    v_readfirstlane_b32 s7, v14
 ; SI-NEXT:    v_readfirstlane_b32 s47, v13
@@ -49911,7 +49912,6 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
 ; SI-NEXT:    v_readfirstlane_b32 s94, v2
 ; SI-NEXT:    v_readfirstlane_b32 s40, v1
 ; SI-NEXT:    v_readfirstlane_b32 s30, v0
-; SI-NEXT:    v_writelane_b32 v63, s35, 3
 ; SI-NEXT:    s_lshr_b32 s15, s29, 16
 ; SI-NEXT:    s_lshr_b32 s79, s28, 16
 ; SI-NEXT:    s_lshr_b32 s14, s27, 16
@@ -50420,10 +50420,10 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
 ; SI-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
 ; SI-NEXT:    v_and_b32_e32 v29, 0xffff, v29
-; SI-NEXT:    v_readlane_b32 s35, v63, 3
-; SI-NEXT:    v_readlane_b32 s34, v63, 2
-; SI-NEXT:    v_readlane_b32 s31, v63, 1
-; SI-NEXT:    v_readlane_b32 s30, v63, 0
+; SI-NEXT:    v_readlane_b32 s30, v63, 2
+; SI-NEXT:    v_readlane_b32 s31, v63, 3
+; SI-NEXT:    v_readlane_b32 s35, v63, 1
+; SI-NEXT:    v_readlane_b32 s34, v63, 0
 ; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_and_b32_e32 v25, 0xffff, v26
 ; SI-NEXT:    v_lshlrev_b32_e32 v26, 16, v28
@@ -50466,13 +50466,14 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
 ; VI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v60, s30, 0
-; VI-NEXT:    v_writelane_b32 v60, s31, 1
-; VI-NEXT:    v_writelane_b32 v60, s34, 2
-; VI-NEXT:    v_writelane_b32 v60, s35, 3
-; VI-NEXT:    v_writelane_b32 v60, s36, 4
-; VI-NEXT:    v_writelane_b32 v60, s37, 5
-; VI-NEXT:    v_writelane_b32 v60, s38, 6
+; VI-NEXT:    v_writelane_b32 v60, s34, 0
+; VI-NEXT:    v_writelane_b32 v60, s35, 1
+; VI-NEXT:    v_writelane_b32 v60, s36, 2
+; VI-NEXT:    v_writelane_b32 v60, s37, 3
+; VI-NEXT:    v_writelane_b32 v60, s38, 4
+; VI-NEXT:    v_writelane_b32 v60, s39, 5
+; VI-NEXT:    v_writelane_b32 v60, s30, 6
+; VI-NEXT:    v_writelane_b32 v60, s31, 7
 ; VI-NEXT:    v_readfirstlane_b32 s44, v15
 ; VI-NEXT:    v_readfirstlane_b32 s46, v14
 ; VI-NEXT:    v_readfirstlane_b32 s56, v13
@@ -50489,7 +50490,6 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
 ; VI-NEXT:    v_readfirstlane_b32 s34, v2
 ; VI-NEXT:    v_readfirstlane_b32 s36, v1
 ; VI-NEXT:    v_readfirstlane_b32 s38, v0
-; VI-NEXT:    v_writelane_b32 v60, s39, 7
 ; VI-NEXT:    s_lshr_b32 s6, s29, 16
 ; VI-NEXT:    s_lshr_b32 s7, s28, 16
 ; VI-NEXT:    s_lshr_b32 s8, s27, 16
@@ -50718,6 +50718,7 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
 ; VI-NEXT:    v_lshlrev_b32_e32 v32, 16, v32
 ; VI-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
 ; VI-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
+; VI-NEXT:    v_readlane_b32 s30, v60, 6
 ; VI-NEXT:    v_or_b32_sdwa v12, v12, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v13, v13, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v14, v14, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -50736,14 +50737,13 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
 ; VI-NEXT:    v_or_b32_sdwa v27, v27, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v28, v28, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_readlane_b32 s39, v60, 7
-; VI-NEXT:    v_readlane_b32 s38, v60, 6
-; VI-NEXT:    v_readlane_b32 s37, v60, 5
-; VI-NEXT:    v_readlane_b32 s36, v60, 4
-; VI-NEXT:    v_readlane_b32 s35, v60, 3
-; VI-NEXT:    v_readlane_b32 s34, v60, 2
-; VI-NEXT:    v_readlane_b32 s31, v60, 1
-; VI-NEXT:    v_readlane_b32 s30, v60, 0
+; VI-NEXT:    v_readlane_b32 s31, v60, 7
+; VI-NEXT:    v_readlane_b32 s39, v60, 5
+; VI-NEXT:    v_readlane_b32 s38, v60, 4
+; VI-NEXT:    v_readlane_b32 s37, v60, 3
+; VI-NEXT:    v_readlane_b32 s36, v60, 2
+; VI-NEXT:    v_readlane_b32 s35, v60, 1
+; VI-NEXT:    v_readlane_b32 s34, v60, 0
 ; VI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -50756,10 +50756,10 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
 ; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v60, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v60, s31, 1
-; GFX9-NEXT:    v_writelane_b32 v60, s34, 2
-; GFX9-NEXT:    v_writelane_b32 v60, s35, 3
+; GFX9-NEXT:    v_writelane_b32 v60, s34, 0
+; GFX9-NEXT:    v_writelane_b32 v60, s35, 1
+; GFX9-NEXT:    v_writelane_b32 v60, s30, 2
+; GFX9-NEXT:    v_writelane_b32 v60, s31, 3
 ; GFX9-NEXT:    v_readfirstlane_b32 s35, v15
 ; GFX9-NEXT:    v_readfirstlane_b32 s34, v14
 ; GFX9-NEXT:    v_readfirstlane_b32 s31, v13
@@ -51034,6 +51034,7 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
 ; GFX9-NEXT:    v_and_b32_e32 v27, 0xffff, v27
 ; GFX9-NEXT:    v_and_b32_e32 v28, 0xffff, v28
 ; GFX9-NEXT:    v_and_b32_e32 v29, 0xffff, v29
+; GFX9-NEXT:    v_readlane_b32 s30, v60, 2
 ; GFX9-NEXT:    v_lshl_or_b32 v12, v55, 16, v12
 ; GFX9-NEXT:    v_lshl_or_b32 v13, v54, 16, v13
 ; GFX9-NEXT:    v_lshl_or_b32 v14, v53, 16, v14
@@ -51052,10 +51053,9 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
 ; GFX9-NEXT:    v_lshl_or_b32 v27, v32, 16, v27
 ; GFX9-NEXT:    v_lshl_or_b32 v28, v31, 16, v28
 ; GFX9-NEXT:    v_lshl_or_b32 v29, v30, 16, v29
-; GFX9-NEXT:    v_readlane_b32 s35, v60, 3
-; GFX9-NEXT:    v_readlane_b32 s34, v60, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v60, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v60, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v60, 3
+; GFX9-NEXT:    v_readlane_b32 s35, v60, 1
+; GFX9-NEXT:    v_readlane_b32 s34, v60, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -51529,3 +51529,5 @@ end:
   %phi = phi <60 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
   ret <60 x i16> %phi
 }
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll
index 687bea385a266..18eeff9df6180 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll
@@ -6,7 +6,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
-define <3 x float> @bitcast_v3i32_to_v3f32(<3 x i32> %a, i32 %b) {
+define <3 x float> @bitcast_v3i32_to_v3f32(<3 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v3i32_to_v3f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -84,7 +84,7 @@ end:
   ret <3 x float> %phi
 }
 
-define inreg <3 x float> @bitcast_v3i32_to_v3f32_scalar(<3 x i32> inreg %a, i32 inreg %b) {
+define inreg <3 x float> @bitcast_v3i32_to_v3f32_scalar(<3 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v3i32_to_v3f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -179,7 +179,7 @@ end:
   ret <3 x float> %phi
 }
 
-define <3 x i32> @bitcast_v3f32_to_v3i32(<3 x float> %a, i32 %b) {
+define <3 x i32> @bitcast_v3f32_to_v3i32(<3 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v3f32_to_v3i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -256,7 +256,7 @@ end:
   ret <3 x i32> %phi
 }
 
-define inreg <3 x i32> @bitcast_v3f32_to_v3i32_scalar(<3 x float> inreg %a, i32 inreg %b) {
+define inreg <3 x i32> @bitcast_v3f32_to_v3i32_scalar(<3 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v3f32_to_v3i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -354,7 +354,7 @@ end:
   ret <3 x i32> %phi
 }
 
-define <12 x i8> @bitcast_v3i32_to_v12i8(<3 x i32> %a, i32 %b) {
+define <12 x i8> @bitcast_v3i32_to_v12i8(<3 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v3i32_to_v12i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -625,7 +625,7 @@ end:
   ret <12 x i8> %phi
 }
 
-define inreg <12 x i8> @bitcast_v3i32_to_v12i8_scalar(<3 x i32> inreg %a, i32 inreg %b) {
+define inreg <12 x i8> @bitcast_v3i32_to_v12i8_scalar(<3 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v3i32_to_v12i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -858,7 +858,7 @@ end:
   ret <12 x i8> %phi
 }
 
-define <3 x i32> @bitcast_v12i8_to_v3i32(<12 x i8> %a, i32 %b) {
+define <3 x i32> @bitcast_v12i8_to_v3i32(<12 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v12i8_to_v3i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1284,7 +1284,7 @@ end:
   ret <3 x i32> %phi
 }
 
-define inreg <3 x i32> @bitcast_v12i8_to_v3i32_scalar(<12 x i8> inreg %a, i32 inreg %b) {
+define inreg <3 x i32> @bitcast_v12i8_to_v3i32_scalar(<12 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v12i8_to_v3i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1571,7 +1571,7 @@ end:
   ret <3 x i32> %phi
 }
 
-define <6 x bfloat> @bitcast_v3i32_to_v6bf16(<3 x i32> %a, i32 %b) {
+define <6 x bfloat> @bitcast_v3i32_to_v6bf16(<3 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v3i32_to_v6bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1684,7 +1684,7 @@ end:
   ret <6 x bfloat> %phi
 }
 
-define inreg <6 x bfloat> @bitcast_v3i32_to_v6bf16_scalar(<3 x i32> inreg %a, i32 inreg %b) {
+define inreg <6 x bfloat> @bitcast_v3i32_to_v6bf16_scalar(<3 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v3i32_to_v6bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1806,7 +1806,7 @@ end:
   ret <6 x bfloat> %phi
 }
 
-define <3 x i32> @bitcast_v6bf16_to_v3i32(<6 x bfloat> %a, i32 %b) {
+define <3 x i32> @bitcast_v6bf16_to_v3i32(<6 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v6bf16_to_v3i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2153,7 +2153,7 @@ end:
   ret <3 x i32> %phi
 }
 
-define inreg <3 x i32> @bitcast_v6bf16_to_v3i32_scalar(<6 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <3 x i32> @bitcast_v6bf16_to_v3i32_scalar(<6 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v6bf16_to_v3i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2535,7 +2535,7 @@ end:
   ret <3 x i32> %phi
 }
 
-define <6 x half> @bitcast_v3i32_to_v6f16(<3 x i32> %a, i32 %b) {
+define <6 x half> @bitcast_v3i32_to_v6f16(<3 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v3i32_to_v6f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2633,7 +2633,7 @@ end:
   ret <6 x half> %phi
 }
 
-define inreg <6 x half> @bitcast_v3i32_to_v6f16_scalar(<3 x i32> inreg %a, i32 inreg %b) {
+define inreg <6 x half> @bitcast_v3i32_to_v6f16_scalar(<3 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v3i32_to_v6f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2746,7 +2746,7 @@ end:
   ret <6 x half> %phi
 }
 
-define <3 x i32> @bitcast_v6f16_to_v3i32(<6 x half> %a, i32 %b) {
+define <3 x i32> @bitcast_v6f16_to_v3i32(<6 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v6f16_to_v3i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2884,7 +2884,7 @@ end:
   ret <3 x i32> %phi
 }
 
-define inreg <3 x i32> @bitcast_v6f16_to_v3i32_scalar(<6 x half> inreg %a, i32 inreg %b) {
+define inreg <3 x i32> @bitcast_v6f16_to_v3i32_scalar(<6 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v6f16_to_v3i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3030,7 +3030,7 @@ end:
   ret <3 x i32> %phi
 }
 
-define <6 x i16> @bitcast_v3i32_to_v6i16(<3 x i32> %a, i32 %b) {
+define <6 x i16> @bitcast_v3i32_to_v6i16(<3 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v3i32_to_v6i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3128,7 +3128,7 @@ end:
   ret <6 x i16> %phi
 }
 
-define inreg <6 x i16> @bitcast_v3i32_to_v6i16_scalar(<3 x i32> inreg %a, i32 inreg %b) {
+define inreg <6 x i16> @bitcast_v3i32_to_v6i16_scalar(<3 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v3i32_to_v6i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3241,7 +3241,7 @@ end:
   ret <6 x i16> %phi
 }
 
-define <3 x i32> @bitcast_v6i16_to_v3i32(<6 x i16> %a, i32 %b) {
+define <3 x i32> @bitcast_v6i16_to_v3i32(<6 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v6i16_to_v3i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3366,7 +3366,7 @@ end:
   ret <3 x i32> %phi
 }
 
-define inreg <3 x i32> @bitcast_v6i16_to_v3i32_scalar(<6 x i16> inreg %a, i32 inreg %b) {
+define inreg <3 x i32> @bitcast_v6i16_to_v3i32_scalar(<6 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v6i16_to_v3i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3499,7 +3499,7 @@ end:
   ret <3 x i32> %phi
 }
 
-define <12 x i8> @bitcast_v3f32_to_v12i8(<3 x float> %a, i32 %b) {
+define <12 x i8> @bitcast_v3f32_to_v12i8(<3 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v3f32_to_v12i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3768,7 +3768,7 @@ end:
   ret <12 x i8> %phi
 }
 
-define inreg <12 x i8> @bitcast_v3f32_to_v12i8_scalar(<3 x float> inreg %a, i32 inreg %b) {
+define inreg <12 x i8> @bitcast_v3f32_to_v12i8_scalar(<3 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v3f32_to_v12i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4021,7 +4021,7 @@ end:
   ret <12 x i8> %phi
 }
 
-define <3 x float> @bitcast_v12i8_to_v3f32(<12 x i8> %a, i32 %b) {
+define <3 x float> @bitcast_v12i8_to_v3f32(<12 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v12i8_to_v3f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4447,7 +4447,7 @@ end:
   ret <3 x float> %phi
 }
 
-define inreg <3 x float> @bitcast_v12i8_to_v3f32_scalar(<12 x i8> inreg %a, i32 inreg %b) {
+define inreg <3 x float> @bitcast_v12i8_to_v3f32_scalar(<12 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v12i8_to_v3f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4734,7 +4734,7 @@ end:
   ret <3 x float> %phi
 }
 
-define <6 x bfloat> @bitcast_v3f32_to_v6bf16(<3 x float> %a, i32 %b) {
+define <6 x bfloat> @bitcast_v3f32_to_v6bf16(<3 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v3f32_to_v6bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4846,7 +4846,7 @@ end:
   ret <6 x bfloat> %phi
 }
 
-define inreg <6 x bfloat> @bitcast_v3f32_to_v6bf16_scalar(<3 x float> inreg %a, i32 inreg %b) {
+define inreg <6 x bfloat> @bitcast_v3f32_to_v6bf16_scalar(<3 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v3f32_to_v6bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4980,7 +4980,7 @@ end:
   ret <6 x bfloat> %phi
 }
 
-define <3 x float> @bitcast_v6bf16_to_v3f32(<6 x bfloat> %a, i32 %b) {
+define <3 x float> @bitcast_v6bf16_to_v3f32(<6 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v6bf16_to_v3f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5327,7 +5327,7 @@ end:
   ret <3 x float> %phi
 }
 
-define inreg <3 x float> @bitcast_v6bf16_to_v3f32_scalar(<6 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <3 x float> @bitcast_v6bf16_to_v3f32_scalar(<6 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v6bf16_to_v3f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5709,7 +5709,7 @@ end:
   ret <3 x float> %phi
 }
 
-define <6 x half> @bitcast_v3f32_to_v6f16(<3 x float> %a, i32 %b) {
+define <6 x half> @bitcast_v3f32_to_v6f16(<3 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v3f32_to_v6f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5806,7 +5806,7 @@ end:
   ret <6 x half> %phi
 }
 
-define inreg <6 x half> @bitcast_v3f32_to_v6f16_scalar(<3 x float> inreg %a, i32 inreg %b) {
+define inreg <6 x half> @bitcast_v3f32_to_v6f16_scalar(<3 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v3f32_to_v6f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5928,7 +5928,7 @@ end:
   ret <6 x half> %phi
 }
 
-define <3 x float> @bitcast_v6f16_to_v3f32(<6 x half> %a, i32 %b) {
+define <3 x float> @bitcast_v6f16_to_v3f32(<6 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v6f16_to_v3f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6066,7 +6066,7 @@ end:
   ret <3 x float> %phi
 }
 
-define inreg <3 x float> @bitcast_v6f16_to_v3f32_scalar(<6 x half> inreg %a, i32 inreg %b) {
+define inreg <3 x float> @bitcast_v6f16_to_v3f32_scalar(<6 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v6f16_to_v3f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6212,7 +6212,7 @@ end:
   ret <3 x float> %phi
 }
 
-define <6 x i16> @bitcast_v3f32_to_v6i16(<3 x float> %a, i32 %b) {
+define <6 x i16> @bitcast_v3f32_to_v6i16(<3 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v3f32_to_v6i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6309,7 +6309,7 @@ end:
   ret <6 x i16> %phi
 }
 
-define inreg <6 x i16> @bitcast_v3f32_to_v6i16_scalar(<3 x float> inreg %a, i32 inreg %b) {
+define inreg <6 x i16> @bitcast_v3f32_to_v6i16_scalar(<3 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v3f32_to_v6i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6431,7 +6431,7 @@ end:
   ret <6 x i16> %phi
 }
 
-define <3 x float> @bitcast_v6i16_to_v3f32(<6 x i16> %a, i32 %b) {
+define <3 x float> @bitcast_v6i16_to_v3f32(<6 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v6i16_to_v3f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6556,7 +6556,7 @@ end:
   ret <3 x float> %phi
 }
 
-define inreg <3 x float> @bitcast_v6i16_to_v3f32_scalar(<6 x i16> inreg %a, i32 inreg %b) {
+define inreg <3 x float> @bitcast_v6i16_to_v3f32_scalar(<6 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v6i16_to_v3f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6689,7 +6689,7 @@ end:
   ret <3 x float> %phi
 }
 
-define <6 x bfloat> @bitcast_v12i8_to_v6bf16(<12 x i8> %a, i32 %b) {
+define <6 x bfloat> @bitcast_v12i8_to_v6bf16(<12 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v12i8_to_v6bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7134,7 +7134,7 @@ end:
   ret <6 x bfloat> %phi
 }
 
-define inreg <6 x bfloat> @bitcast_v12i8_to_v6bf16_scalar(<12 x i8> inreg %a, i32 inreg %b) {
+define inreg <6 x bfloat> @bitcast_v12i8_to_v6bf16_scalar(<12 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v12i8_to_v6bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7438,7 +7438,7 @@ end:
   ret <6 x bfloat> %phi
 }
 
-define <12 x i8> @bitcast_v6bf16_to_v12i8(<6 x bfloat> %a, i32 %b) {
+define <12 x i8> @bitcast_v6bf16_to_v12i8(<6 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v6bf16_to_v12i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7958,7 +7958,7 @@ end:
   ret <12 x i8> %phi
 }
 
-define inreg <12 x i8> @bitcast_v6bf16_to_v12i8_scalar(<6 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <12 x i8> @bitcast_v6bf16_to_v12i8_scalar(<6 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v6bf16_to_v12i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8519,7 +8519,7 @@ end:
   ret <12 x i8> %phi
 }
 
-define <6 x half> @bitcast_v12i8_to_v6f16(<12 x i8> %a, i32 %b) {
+define <6 x half> @bitcast_v12i8_to_v6f16(<12 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v12i8_to_v6f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8962,7 +8962,7 @@ end:
   ret <6 x half> %phi
 }
 
-define inreg <6 x half> @bitcast_v12i8_to_v6f16_scalar(<12 x i8> inreg %a, i32 inreg %b) {
+define inreg <6 x half> @bitcast_v12i8_to_v6f16_scalar(<12 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v12i8_to_v6f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9269,7 +9269,7 @@ end:
   ret <6 x half> %phi
 }
 
-define <12 x i8> @bitcast_v6f16_to_v12i8(<6 x half> %a, i32 %b) {
+define <12 x i8> @bitcast_v6f16_to_v12i8(<6 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v6f16_to_v12i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9586,7 +9586,7 @@ end:
   ret <12 x i8> %phi
 }
 
-define inreg <12 x i8> @bitcast_v6f16_to_v12i8_scalar(<6 x half> inreg %a, i32 inreg %b) {
+define inreg <12 x i8> @bitcast_v6f16_to_v12i8_scalar(<6 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v6f16_to_v12i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9882,7 +9882,7 @@ end:
   ret <12 x i8> %phi
 }
 
-define <6 x i16> @bitcast_v12i8_to_v6i16(<12 x i8> %a, i32 %b) {
+define <6 x i16> @bitcast_v12i8_to_v6i16(<12 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v12i8_to_v6i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10325,7 +10325,7 @@ end:
   ret <6 x i16> %phi
 }
 
-define inreg <6 x i16> @bitcast_v12i8_to_v6i16_scalar(<12 x i8> inreg %a, i32 inreg %b) {
+define inreg <6 x i16> @bitcast_v12i8_to_v6i16_scalar(<12 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v12i8_to_v6i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10632,7 +10632,7 @@ end:
   ret <6 x i16> %phi
 }
 
-define <12 x i8> @bitcast_v6i16_to_v12i8(<6 x i16> %a, i32 %b) {
+define <12 x i8> @bitcast_v6i16_to_v12i8(<6 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v6i16_to_v12i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10942,7 +10942,7 @@ end:
   ret <12 x i8> %phi
 }
 
-define inreg <12 x i8> @bitcast_v6i16_to_v12i8_scalar(<6 x i16> inreg %a, i32 inreg %b) {
+define inreg <12 x i8> @bitcast_v6i16_to_v12i8_scalar(<6 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v6i16_to_v12i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11219,7 +11219,7 @@ end:
   ret <12 x i8> %phi
 }
 
-define <6 x half> @bitcast_v6bf16_to_v6f16(<6 x bfloat> %a, i32 %b) {
+define <6 x half> @bitcast_v6bf16_to_v6f16(<6 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v6bf16_to_v6f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11580,7 +11580,7 @@ end:
   ret <6 x half> %phi
 }
 
-define inreg <6 x half> @bitcast_v6bf16_to_v6f16_scalar(<6 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <6 x half> @bitcast_v6bf16_to_v6f16_scalar(<6 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v6bf16_to_v6f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11978,7 +11978,7 @@ end:
   ret <6 x half> %phi
 }
 
-define <6 x bfloat> @bitcast_v6f16_to_v6bf16(<6 x half> %a, i32 %b) {
+define <6 x bfloat> @bitcast_v6f16_to_v6bf16(<6 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v6f16_to_v6bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12122,7 +12122,7 @@ end:
   ret <6 x bfloat> %phi
 }
 
-define inreg <6 x bfloat> @bitcast_v6f16_to_v6bf16_scalar(<6 x half> inreg %a, i32 inreg %b) {
+define inreg <6 x bfloat> @bitcast_v6f16_to_v6bf16_scalar(<6 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v6f16_to_v6bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12288,7 +12288,7 @@ end:
   ret <6 x bfloat> %phi
 }
 
-define <6 x i16> @bitcast_v6bf16_to_v6i16(<6 x bfloat> %a, i32 %b) {
+define <6 x i16> @bitcast_v6bf16_to_v6i16(<6 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v6bf16_to_v6i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12641,7 +12641,7 @@ end:
   ret <6 x i16> %phi
 }
 
-define inreg <6 x i16> @bitcast_v6bf16_to_v6i16_scalar(<6 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <6 x i16> @bitcast_v6bf16_to_v6i16_scalar(<6 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v6bf16_to_v6i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13018,7 +13018,7 @@ end:
   ret <6 x i16> %phi
 }
 
-define <6 x bfloat> @bitcast_v6i16_to_v6bf16(<6 x i16> %a, i32 %b) {
+define <6 x bfloat> @bitcast_v6i16_to_v6bf16(<6 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v6i16_to_v6bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13149,7 +13149,7 @@ end:
   ret <6 x bfloat> %phi
 }
 
-define inreg <6 x bfloat> @bitcast_v6i16_to_v6bf16_scalar(<6 x i16> inreg %a, i32 inreg %b) {
+define inreg <6 x bfloat> @bitcast_v6i16_to_v6bf16_scalar(<6 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v6i16_to_v6bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13300,7 +13300,7 @@ end:
   ret <6 x bfloat> %phi
 }
 
-define <6 x i16> @bitcast_v6f16_to_v6i16(<6 x half> %a, i32 %b) {
+define <6 x i16> @bitcast_v6f16_to_v6i16(<6 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v6f16_to_v6i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13422,7 +13422,7 @@ end:
   ret <6 x i16> %phi
 }
 
-define inreg <6 x i16> @bitcast_v6f16_to_v6i16_scalar(<6 x half> inreg %a, i32 inreg %b) {
+define inreg <6 x i16> @bitcast_v6f16_to_v6i16_scalar(<6 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v6f16_to_v6i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13574,7 +13574,7 @@ end:
   ret <6 x i16> %phi
 }
 
-define <6 x half> @bitcast_v6i16_to_v6f16(<6 x i16> %a, i32 %b) {
+define <6 x half> @bitcast_v6i16_to_v6f16(<6 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v6i16_to_v6f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13707,7 +13707,7 @@ end:
   ret <6 x half> %phi
 }
 
-define inreg <6 x half> @bitcast_v6i16_to_v6f16_scalar(<6 x i16> inreg %a, i32 inreg %b) {
+define inreg <6 x half> @bitcast_v6i16_to_v6f16_scalar(<6 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v6i16_to_v6f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13856,3 +13856,5 @@ end:
   %phi = phi <6 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
   ret <6 x half> %phi
 }
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ptr.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ptr.ll
index 2b48cf0f41c88..7e9f825e298c7 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ptr.ll
@@ -5,7 +5,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
 
-define amdgpu_kernel void @bitcast_i8ptr_v16i8ptr(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+define amdgpu_kernel void @bitcast_i8ptr_v16i8ptr(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; SI-LABEL: bitcast_i8ptr_v16i8ptr:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -66,3 +66,5 @@ entry:
   store <16 x i8> %0, ptr addrspace(1) %out
   ret void
 }
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll
index cdd569fa17236..6a2b3e2c51f9a 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll
@@ -587,32 +587,32 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_realign_stack(i32
 ; GISEL-GFX11-LABEL: amdgpu_cs_chain_preserve_realign_stack:
 ; GISEL-GFX11:       ; %bb.0:
 ; GISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-NEXT:    s_add_i32 s33, s32, 31
+; GISEL-GFX11-NEXT:    s_mov_b32 s34, s32
+; GISEL-GFX11-NEXT:    s_and_not1_b32 s33, s33, 31
 ; GISEL-GFX11-NEXT:    s_mov_b32 s7, 4
 ; GISEL-GFX11-NEXT:    s_mov_b32 s6, 3
 ; GISEL-GFX11-NEXT:    s_mov_b32 s5, 2
 ; GISEL-GFX11-NEXT:    s_mov_b32 s4, 1
-; GISEL-GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GISEL-GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GISEL-GFX11-NEXT:    v_dual_mov_b32 v1, s4 :: v_dual_lshlrev_b32 v0, 4, v8
-; GISEL-GFX11-NEXT:    s_add_i32 s33, s32, 31
+; GISEL-GFX11-NEXT:    v_mov_b32_e32 v4, s7
 ; GISEL-GFX11-NEXT:    v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, s6
-; GISEL-GFX11-NEXT:    s_and_not1_b32 s33, s33, 31
-; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GISEL-GFX11-NEXT:    v_dual_mov_b32 v4, s7 :: v_dual_add_nc_u32 v5, s33, v0
+; GISEL-GFX11-NEXT:    v_add_nc_u32_e32 v5, s33, v0
 ; GISEL-GFX11-NEXT:    scratch_store_b32 off, v16, s33 ; 4-byte Folded Spill
 ; GISEL-GFX11-NEXT:    s_mov_b32 s3, s0
 ; GISEL-GFX11-NEXT:    ;;#ASMSTART
 ; GISEL-GFX11-NEXT:    s_nop
 ; GISEL-GFX11-NEXT:    ;;#ASMEND
 ; GISEL-GFX11-NEXT:    v_dual_mov_b32 v8, v9 :: v_dual_add_nc_u32 v5, 32, v5
-; GISEL-GFX11-NEXT:    v_dual_mov_b32 v9, v10 :: v_dual_mov_b32 v10, v11
-; GISEL-GFX11-NEXT:    s_mov_b32 s34, s32
+; GISEL-GFX11-NEXT:    s_addk_i32 s32, 0xe0
 ; GISEL-GFX11-NEXT:    s_mov_b32 s4, chain_preserve_callee@abs32@lo
+; GISEL-GFX11-NEXT:    s_mov_b32 s5, chain_preserve_callee@abs32@hi
+; GISEL-GFX11-NEXT:    v_dual_mov_b32 v9, v10 :: v_dual_mov_b32 v10, v11
 ; GISEL-GFX11-NEXT:    scratch_store_b128 v5, v[1:4], off dlc
 ; GISEL-GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GISEL-GFX11-NEXT:    scratch_load_b32 v16, off, s33 ; 4-byte Folded Reload
-; GISEL-GFX11-NEXT:    s_mov_b32 s5, chain_preserve_callee@abs32@hi
 ; GISEL-GFX11-NEXT:    s_mov_b32 s0, s3
-; GISEL-GFX11-NEXT:    s_addk_i32 s32, 0xe0
 ; GISEL-GFX11-NEXT:    s_mov_b32 s32, s34
 ; GISEL-GFX11-NEXT:    s_mov_b32 exec_lo, -1
 ; GISEL-GFX11-NEXT:    s_setpc_b64 s[4:5]
@@ -621,26 +621,25 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_realign_stack(i32
 ; GISEL-GFX10:       ; %bb.0:
 ; GISEL-GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX10-NEXT:    s_add_i32 s33, s32, 0x3e0
-; GISEL-GFX10-NEXT:    v_lshlrev_b32_e32 v0, 4, v8
+; GISEL-GFX10-NEXT:    s_mov_b32 s34, s32
 ; GISEL-GFX10-NEXT:    s_and_b32 s33, s33, 0xfffffc00
-; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 1
+; GISEL-GFX10-NEXT:    v_lshlrev_b32_e32 v0, 4, v8
 ; GISEL-GFX10-NEXT:    v_lshrrev_b32_e64 v3, 5, s33
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 1
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v4, 3
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v5, 4
 ; GISEL-GFX10-NEXT:    buffer_store_dword v16, off, s[48:51], s33 ; 4-byte Folded Spill
-; GISEL-GFX10-NEXT:    s_mov_b32 s3, s0
 ; GISEL-GFX10-NEXT:    v_add_nc_u32_e32 v2, v0, v3
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v3, 2
+; GISEL-GFX10-NEXT:    s_mov_b32 s3, s0
 ; GISEL-GFX10-NEXT:    ;;#ASMSTART
 ; GISEL-GFX10-NEXT:    s_nop
 ; GISEL-GFX10-NEXT:    ;;#ASMEND
-; GISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v9
-; GISEL-GFX10-NEXT:    v_mov_b32_e32 v9, v10
+; GISEL-GFX10-NEXT:    s_addk_i32 s32, 0x1c00
 ; GISEL-GFX10-NEXT:    v_add_nc_u32_e32 v2, 32, v2
-; GISEL-GFX10-NEXT:    v_mov_b32_e32 v10, v11
-; GISEL-GFX10-NEXT:    s_mov_b32 s34, s32
 ; GISEL-GFX10-NEXT:    s_mov_b32 s4, chain_preserve_callee@abs32@lo
 ; GISEL-GFX10-NEXT:    s_mov_b32 s5, chain_preserve_callee@abs32@hi
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v9
 ; GISEL-GFX10-NEXT:    buffer_store_dword v1, v2, s[48:51], 0 offen
 ; GISEL-GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GISEL-GFX10-NEXT:    buffer_store_dword v3, v2, s[48:51], 0 offen offset:4
@@ -650,8 +649,9 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_realign_stack(i32
 ; GISEL-GFX10-NEXT:    buffer_store_dword v5, v2, s[48:51], 0 offen offset:12
 ; GISEL-GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GISEL-GFX10-NEXT:    buffer_load_dword v16, off, s[48:51], s33 ; 4-byte Folded Reload
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v9, v10
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v10, v11
 ; GISEL-GFX10-NEXT:    s_mov_b32 s0, s3
-; GISEL-GFX10-NEXT:    s_addk_i32 s32, 0x1c00
 ; GISEL-GFX10-NEXT:    s_mov_b32 s32, s34
 ; GISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
 ; GISEL-GFX10-NEXT:    s_setpc_b64 s[4:5]
@@ -672,13 +672,13 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_realign_stack(i32
 ; DAGISEL-GFX11-NEXT:    ;;#ASMSTART
 ; DAGISEL-GFX11-NEXT:    s_nop
 ; DAGISEL-GFX11-NEXT:    ;;#ASMEND
-; DAGISEL-GFX11-NEXT:    v_dual_mov_b32 v8, v9 :: v_dual_mov_b32 v9, v10
+; DAGISEL-GFX11-NEXT:    s_mov_b32 s5, chain_preserve_callee@abs32@hi
+; DAGISEL-GFX11-NEXT:    s_mov_b32 s4, chain_preserve_callee@abs32@lo
 ; DAGISEL-GFX11-NEXT:    scratch_store_b128 v5, v[1:4], off dlc
 ; DAGISEL-GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; DAGISEL-GFX11-NEXT:    scratch_load_b32 v16, off, s33 ; 4-byte Folded Reload
+; DAGISEL-GFX11-NEXT:    v_dual_mov_b32 v8, v9 :: v_dual_mov_b32 v9, v10
 ; DAGISEL-GFX11-NEXT:    v_mov_b32_e32 v10, v11
-; DAGISEL-GFX11-NEXT:    s_mov_b32 s5, chain_preserve_callee@abs32@hi
-; DAGISEL-GFX11-NEXT:    s_mov_b32 s4, chain_preserve_callee@abs32@lo
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 s0, s3
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 s32, s34
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 exec_lo, -1
@@ -688,26 +688,25 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_realign_stack(i32
 ; DAGISEL-GFX10:       ; %bb.0:
 ; DAGISEL-GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; DAGISEL-GFX10-NEXT:    s_add_i32 s33, s32, 0x3e0
-; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v2, 4
+; DAGISEL-GFX10-NEXT:    s_mov_b32 s34, s32
 ; DAGISEL-GFX10-NEXT:    s_and_b32 s33, s33, 0xfffffc00
-; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v3, 3
 ; DAGISEL-GFX10-NEXT:    v_lshrrev_b32_e64 v0, 5, s33
+; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v2, 4
+; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v3, 3
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v4, 2
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v5, 1
+; DAGISEL-GFX10-NEXT:    v_add_nc_u32_e32 v0, 32, v0
 ; DAGISEL-GFX10-NEXT:    buffer_store_dword v16, off, s[48:51], s33 ; 4-byte Folded Spill
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 s3, s0
-; DAGISEL-GFX10-NEXT:    v_add_nc_u32_e32 v0, 32, v0
-; DAGISEL-GFX10-NEXT:    s_mov_b32 s34, s32
-; DAGISEL-GFX10-NEXT:    s_mov_b32 s5, chain_preserve_callee@abs32@hi
-; DAGISEL-GFX10-NEXT:    s_mov_b32 s4, chain_preserve_callee@abs32@lo
 ; DAGISEL-GFX10-NEXT:    s_addk_i32 s32, 0x1c00
+; DAGISEL-GFX10-NEXT:    s_mov_b32 s5, chain_preserve_callee@abs32@hi
 ; DAGISEL-GFX10-NEXT:    v_lshl_add_u32 v1, v8, 4, v0
 ; DAGISEL-GFX10-NEXT:    ;;#ASMSTART
 ; DAGISEL-GFX10-NEXT:    s_nop
 ; DAGISEL-GFX10-NEXT:    ;;#ASMEND
+; DAGISEL-GFX10-NEXT:    s_mov_b32 s4, chain_preserve_callee@abs32@lo
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v9
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v9, v10
-; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v10, v11
 ; DAGISEL-GFX10-NEXT:    buffer_store_dword v2, v1, s[48:51], 0 offen offset:12
 ; DAGISEL-GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; DAGISEL-GFX10-NEXT:    buffer_store_dword v3, v1, s[48:51], 0 offen offset:8
@@ -717,6 +716,7 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_realign_stack(i32
 ; DAGISEL-GFX10-NEXT:    buffer_store_dword v5, v1, s[48:51], 0 offen
 ; DAGISEL-GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; DAGISEL-GFX10-NEXT:    buffer_load_dword v16, off, s[48:51], s33 ; 4-byte Folded Reload
+; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v10, v11
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 s0, s3
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 s32, s34
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
index afe0971088bc1..ff31915e9080f 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
@@ -119,32 +119,32 @@ define double @test_pow_fast_f64__integral_y(double %x, i32 %y.i) {
 ; CHECK-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
 ; CHECK-NEXT:    s_mov_b64 exec, s[18:19]
 ; CHECK-NEXT:    v_writelane_b32 v43, s16, 14
-; CHECK-NEXT:    v_writelane_b32 v43, s30, 0
-; CHECK-NEXT:    v_writelane_b32 v43, s31, 1
-; CHECK-NEXT:    v_writelane_b32 v43, s34, 2
-; CHECK-NEXT:    v_writelane_b32 v43, s35, 3
-; CHECK-NEXT:    v_writelane_b32 v43, s36, 4
-; CHECK-NEXT:    v_writelane_b32 v43, s37, 5
-; CHECK-NEXT:    v_writelane_b32 v43, s38, 6
-; CHECK-NEXT:    v_writelane_b32 v43, s39, 7
+; CHECK-NEXT:    v_writelane_b32 v43, s34, 0
+; CHECK-NEXT:    v_writelane_b32 v43, s35, 1
+; CHECK-NEXT:    v_writelane_b32 v43, s36, 2
+; CHECK-NEXT:    v_writelane_b32 v43, s37, 3
+; CHECK-NEXT:    v_writelane_b32 v43, s38, 4
+; CHECK-NEXT:    v_writelane_b32 v43, s39, 5
 ; CHECK-NEXT:    s_addk_i32 s32, 0x800
-; CHECK-NEXT:    v_writelane_b32 v43, s48, 8
-; CHECK-NEXT:    v_writelane_b32 v43, s49, 9
+; CHECK-NEXT:    v_writelane_b32 v43, s48, 6
+; CHECK-NEXT:    v_writelane_b32 v43, s49, 7
 ; CHECK-NEXT:    s_mov_b64 s[48:49], s[4:5]
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
 ; CHECK-NEXT:    s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12
+; CHECK-NEXT:    v_writelane_b32 v43, s50, 8
 ; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT:    v_writelane_b32 v43, s50, 10
+; CHECK-NEXT:    v_writelane_b32 v43, s51, 9
+; CHECK-NEXT:    v_writelane_b32 v43, s52, 10
 ; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; CHECK-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
-; CHECK-NEXT:    v_writelane_b32 v43, s51, 11
+; CHECK-NEXT:    v_writelane_b32 v43, s53, 11
 ; CHECK-NEXT:    v_mov_b32_e32 v42, v1
-; CHECK-NEXT:    v_writelane_b32 v43, s52, 12
+; CHECK-NEXT:    v_writelane_b32 v43, s30, 12
 ; CHECK-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v42
 ; CHECK-NEXT:    s_mov_b64 s[4:5], s[48:49]
-; CHECK-NEXT:    v_writelane_b32 v43, s53, 13
+; CHECK-NEXT:    v_writelane_b32 v43, s31, 13
 ; CHECK-NEXT:    v_mov_b32_e32 v40, v31
 ; CHECK-NEXT:    v_mov_b32_e32 v41, v2
 ; CHECK-NEXT:    s_mov_b32 s50, s15
@@ -178,21 +178,21 @@ define double @test_pow_fast_f64__integral_y(double %x, i32 %y.i) {
 ; CHECK-NEXT:    buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
 ; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; CHECK-NEXT:    v_readlane_b32 s30, v43, 12
 ; CHECK-NEXT:    v_or_b32_e32 v1, v1, v2
-; CHECK-NEXT:    v_readlane_b32 s53, v43, 13
-; CHECK-NEXT:    v_readlane_b32 s52, v43, 12
-; CHECK-NEXT:    v_readlane_b32 s51, v43, 11
-; CHECK-NEXT:    v_readlane_b32 s50, v43, 10
-; CHECK-NEXT:    v_readlane_b32 s49, v43, 9
-; CHECK-NEXT:    v_readlane_b32 s48, v43, 8
-; CHECK-NEXT:    v_readlane_b32 s39, v43, 7
-; CHECK-NEXT:    v_readlane_b32 s38, v43, 6
-; CHECK-NEXT:    v_readlane_b32 s37, v43, 5
-; CHECK-NEXT:    v_readlane_b32 s36, v43, 4
-; CHECK-NEXT:    v_readlane_b32 s35, v43, 3
-; CHECK-NEXT:    v_readlane_b32 s34, v43, 2
-; CHECK-NEXT:    v_readlane_b32 s31, v43, 1
-; CHECK-NEXT:    v_readlane_b32 s30, v43, 0
+; CHECK-NEXT:    v_readlane_b32 s31, v43, 13
+; CHECK-NEXT:    v_readlane_b32 s53, v43, 11
+; CHECK-NEXT:    v_readlane_b32 s52, v43, 10
+; CHECK-NEXT:    v_readlane_b32 s51, v43, 9
+; CHECK-NEXT:    v_readlane_b32 s50, v43, 8
+; CHECK-NEXT:    v_readlane_b32 s49, v43, 7
+; CHECK-NEXT:    v_readlane_b32 s48, v43, 6
+; CHECK-NEXT:    v_readlane_b32 s39, v43, 5
+; CHECK-NEXT:    v_readlane_b32 s38, v43, 4
+; CHECK-NEXT:    v_readlane_b32 s37, v43, 3
+; CHECK-NEXT:    v_readlane_b32 s36, v43, 2
+; CHECK-NEXT:    v_readlane_b32 s35, v43, 1
+; CHECK-NEXT:    v_readlane_b32 s34, v43, 0
 ; CHECK-NEXT:    s_mov_b32 s32, s33
 ; CHECK-NEXT:    v_readlane_b32 s4, v43, 14
 ; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -259,30 +259,30 @@ define double @test_powr_fast_f64(double %x, double %y) {
 ; CHECK-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
 ; CHECK-NEXT:    s_mov_b64 exec, s[18:19]
 ; CHECK-NEXT:    v_writelane_b32 v43, s16, 14
-; CHECK-NEXT:    v_writelane_b32 v43, s30, 0
-; CHECK-NEXT:    v_writelane_b32 v43, s31, 1
-; CHECK-NEXT:    v_writelane_b32 v43, s34, 2
-; CHECK-NEXT:    v_writelane_b32 v43, s35, 3
-; CHECK-NEXT:    v_writelane_b32 v43, s36, 4
-; CHECK-NEXT:    v_writelane_b32 v43, s37, 5
-; CHECK-NEXT:    v_writelane_b32 v43, s38, 6
-; CHECK-NEXT:    v_writelane_b32 v43, s39, 7
+; CHECK-NEXT:    v_writelane_b32 v43, s34, 0
+; CHECK-NEXT:    v_writelane_b32 v43, s35, 1
+; CHECK-NEXT:    v_writelane_b32 v43, s36, 2
+; CHECK-NEXT:    v_writelane_b32 v43, s37, 3
+; CHECK-NEXT:    v_writelane_b32 v43, s38, 4
+; CHECK-NEXT:    v_writelane_b32 v43, s39, 5
 ; CHECK-NEXT:    s_addk_i32 s32, 0x800
-; CHECK-NEXT:    v_writelane_b32 v43, s48, 8
-; CHECK-NEXT:    v_writelane_b32 v43, s49, 9
+; CHECK-NEXT:    v_writelane_b32 v43, s48, 6
+; CHECK-NEXT:    v_writelane_b32 v43, s49, 7
 ; CHECK-NEXT:    s_mov_b64 s[48:49], s[4:5]
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
 ; CHECK-NEXT:    s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12
+; CHECK-NEXT:    v_writelane_b32 v43, s50, 8
 ; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT:    v_writelane_b32 v43, s50, 10
-; CHECK-NEXT:    v_writelane_b32 v43, s51, 11
-; CHECK-NEXT:    v_writelane_b32 v43, s52, 12
+; CHECK-NEXT:    v_writelane_b32 v43, s51, 9
+; CHECK-NEXT:    v_writelane_b32 v43, s52, 10
+; CHECK-NEXT:    v_writelane_b32 v43, s53, 11
+; CHECK-NEXT:    v_writelane_b32 v43, s30, 12
 ; CHECK-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; CHECK-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
-; CHECK-NEXT:    v_writelane_b32 v43, s53, 13
+; CHECK-NEXT:    v_writelane_b32 v43, s31, 13
 ; CHECK-NEXT:    v_mov_b32_e32 v42, v31
 ; CHECK-NEXT:    v_mov_b32_e32 v41, v3
 ; CHECK-NEXT:    v_mov_b32_e32 v40, v2
@@ -314,20 +314,20 @@ define double @test_powr_fast_f64(double %x, double %y) {
 ; CHECK-NEXT:    buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
 ; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; CHECK-NEXT:    v_readlane_b32 s53, v43, 13
-; CHECK-NEXT:    v_readlane_b32 s52, v43, 12
-; CHECK-NEXT:    v_readlane_b32 s51, v43, 11
-; CHECK-NEXT:    v_readlane_b32 s50, v43, 10
-; CHECK-NEXT:    v_readlane_b32 s49, v43, 9
-; CHECK-NEXT:    v_readlane_b32 s48, v43, 8
-; CHECK-NEXT:    v_readlane_b32 s39, v43, 7
-; CHECK-NEXT:    v_readlane_b32 s38, v43, 6
-; CHECK-NEXT:    v_readlane_b32 s37, v43, 5
-; CHECK-NEXT:    v_readlane_b32 s36, v43, 4
-; CHECK-NEXT:    v_readlane_b32 s35, v43, 3
-; CHECK-NEXT:    v_readlane_b32 s34, v43, 2
-; CHECK-NEXT:    v_readlane_b32 s31, v43, 1
-; CHECK-NEXT:    v_readlane_b32 s30, v43, 0
+; CHECK-NEXT:    v_readlane_b32 s30, v43, 12
+; CHECK-NEXT:    v_readlane_b32 s31, v43, 13
+; CHECK-NEXT:    v_readlane_b32 s53, v43, 11
+; CHECK-NEXT:    v_readlane_b32 s52, v43, 10
+; CHECK-NEXT:    v_readlane_b32 s51, v43, 9
+; CHECK-NEXT:    v_readlane_b32 s50, v43, 8
+; CHECK-NEXT:    v_readlane_b32 s49, v43, 7
+; CHECK-NEXT:    v_readlane_b32 s48, v43, 6
+; CHECK-NEXT:    v_readlane_b32 s39, v43, 5
+; CHECK-NEXT:    v_readlane_b32 s38, v43, 4
+; CHECK-NEXT:    v_readlane_b32 s37, v43, 3
+; CHECK-NEXT:    v_readlane_b32 s36, v43, 2
+; CHECK-NEXT:    v_readlane_b32 s35, v43, 1
+; CHECK-NEXT:    v_readlane_b32 s34, v43, 0
 ; CHECK-NEXT:    s_mov_b32 s32, s33
 ; CHECK-NEXT:    v_readlane_b32 s4, v43, 14
 ; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -401,32 +401,32 @@ define double @test_pown_fast_f64(double %x, i32 %y) {
 ; CHECK-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
 ; CHECK-NEXT:    s_mov_b64 exec, s[18:19]
 ; CHECK-NEXT:    v_writelane_b32 v43, s16, 14
-; CHECK-NEXT:    v_writelane_b32 v43, s30, 0
-; CHECK-NEXT:    v_writelane_b32 v43, s31, 1
-; CHECK-NEXT:    v_writelane_b32 v43, s34, 2
-; CHECK-NEXT:    v_writelane_b32 v43, s35, 3
-; CHECK-NEXT:    v_writelane_b32 v43, s36, 4
-; CHECK-NEXT:    v_writelane_b32 v43, s37, 5
-; CHECK-NEXT:    v_writelane_b32 v43, s38, 6
-; CHECK-NEXT:    v_writelane_b32 v43, s39, 7
+; CHECK-NEXT:    v_writelane_b32 v43, s34, 0
+; CHECK-NEXT:    v_writelane_b32 v43, s35, 1
+; CHECK-NEXT:    v_writelane_b32 v43, s36, 2
+; CHECK-NEXT:    v_writelane_b32 v43, s37, 3
+; CHECK-NEXT:    v_writelane_b32 v43, s38, 4
+; CHECK-NEXT:    v_writelane_b32 v43, s39, 5
 ; CHECK-NEXT:    s_addk_i32 s32, 0x800
-; CHECK-NEXT:    v_writelane_b32 v43, s48, 8
-; CHECK-NEXT:    v_writelane_b32 v43, s49, 9
+; CHECK-NEXT:    v_writelane_b32 v43, s48, 6
+; CHECK-NEXT:    v_writelane_b32 v43, s49, 7
 ; CHECK-NEXT:    s_mov_b64 s[48:49], s[4:5]
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
 ; CHECK-NEXT:    s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12
+; CHECK-NEXT:    v_writelane_b32 v43, s50, 8
 ; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT:    v_writelane_b32 v43, s50, 10
+; CHECK-NEXT:    v_writelane_b32 v43, s51, 9
+; CHECK-NEXT:    v_writelane_b32 v43, s52, 10
 ; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; CHECK-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
-; CHECK-NEXT:    v_writelane_b32 v43, s51, 11
+; CHECK-NEXT:    v_writelane_b32 v43, s53, 11
 ; CHECK-NEXT:    v_mov_b32_e32 v42, v1
-; CHECK-NEXT:    v_writelane_b32 v43, s52, 12
+; CHECK-NEXT:    v_writelane_b32 v43, s30, 12
 ; CHECK-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v42
 ; CHECK-NEXT:    s_mov_b64 s[4:5], s[48:49]
-; CHECK-NEXT:    v_writelane_b32 v43, s53, 13
+; CHECK-NEXT:    v_writelane_b32 v43, s31, 13
 ; CHECK-NEXT:    v_mov_b32_e32 v40, v31
 ; CHECK-NEXT:    v_mov_b32_e32 v41, v2
 ; CHECK-NEXT:    s_mov_b32 s50, s15
@@ -460,21 +460,21 @@ define double @test_pown_fast_f64(double %x, i32 %y) {
 ; CHECK-NEXT:    buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
 ; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; CHECK-NEXT:    v_readlane_b32 s30, v43, 12
 ; CHECK-NEXT:    v_or_b32_e32 v1, v1, v2
-; CHECK-NEXT:    v_readlane_b32 s53, v43, 13
-; CHECK-NEXT:    v_readlane_b32 s52, v43, 12
-; CHECK-NEXT:    v_readlane_b32 s51, v43, 11
-; CHECK-NEXT:    v_readlane_b32 s50, v43, 10
-; CHECK-NEXT:    v_readlane_b32 s49, v43, 9
-; CHECK-NEXT:    v_readlane_b32 s48, v43, 8
-; CHECK-NEXT:    v_readlane_b32 s39, v43, 7
-; CHECK-NEXT:    v_readlane_b32 s38, v43, 6
-; CHECK-NEXT:    v_readlane_b32 s37, v43, 5
-; CHECK-NEXT:    v_readlane_b32 s36, v43, 4
-; CHECK-NEXT:    v_readlane_b32 s35, v43, 3
-; CHECK-NEXT:    v_readlane_b32 s34, v43, 2
-; CHECK-NEXT:    v_readlane_b32 s31, v43, 1
-; CHECK-NEXT:    v_readlane_b32 s30, v43, 0
+; CHECK-NEXT:    v_readlane_b32 s31, v43, 13
+; CHECK-NEXT:    v_readlane_b32 s53, v43, 11
+; CHECK-NEXT:    v_readlane_b32 s52, v43, 10
+; CHECK-NEXT:    v_readlane_b32 s51, v43, 9
+; CHECK-NEXT:    v_readlane_b32 s50, v43, 8
+; CHECK-NEXT:    v_readlane_b32 s49, v43, 7
+; CHECK-NEXT:    v_readlane_b32 s48, v43, 6
+; CHECK-NEXT:    v_readlane_b32 s39, v43, 5
+; CHECK-NEXT:    v_readlane_b32 s38, v43, 4
+; CHECK-NEXT:    v_readlane_b32 s37, v43, 3
+; CHECK-NEXT:    v_readlane_b32 s36, v43, 2
+; CHECK-NEXT:    v_readlane_b32 s35, v43, 1
+; CHECK-NEXT:    v_readlane_b32 s34, v43, 0
 ; CHECK-NEXT:    s_mov_b32 s32, s33
 ; CHECK-NEXT:    v_readlane_b32 s4, v43, 14
 ; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -543,30 +543,30 @@ define double @test_pown_fast_f64_known_even(double %x, i32 %y.arg) {
 ; CHECK-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; CHECK-NEXT:    s_mov_b64 exec, s[18:19]
 ; CHECK-NEXT:    v_writelane_b32 v42, s16, 14
-; CHECK-NEXT:    v_writelane_b32 v42, s30, 0
-; CHECK-NEXT:    v_writelane_b32 v42, s31, 1
-; CHECK-NEXT:    v_writelane_b32 v42, s34, 2
-; CHECK-NEXT:    v_writelane_b32 v42, s35, 3
-; CHECK-NEXT:    v_writelane_b32 v42, s36, 4
-; CHECK-NEXT:    v_writelane_b32 v42, s37, 5
-; CHECK-NEXT:    v_writelane_b32 v42, s38, 6
-; CHECK-NEXT:    v_writelane_b32 v42, s39, 7
+; CHECK-NEXT:    v_writelane_b32 v42, s34, 0
+; CHECK-NEXT:    v_writelane_b32 v42, s35, 1
+; CHECK-NEXT:    v_writelane_b32 v42, s36, 2
+; CHECK-NEXT:    v_writelane_b32 v42, s37, 3
+; CHECK-NEXT:    v_writelane_b32 v42, s38, 4
+; CHECK-NEXT:    v_writelane_b32 v42, s39, 5
 ; CHECK-NEXT:    s_addk_i32 s32, 0x400
-; CHECK-NEXT:    v_writelane_b32 v42, s48, 8
-; CHECK-NEXT:    v_writelane_b32 v42, s49, 9
+; CHECK-NEXT:    v_writelane_b32 v42, s48, 6
+; CHECK-NEXT:    v_writelane_b32 v42, s49, 7
 ; CHECK-NEXT:    s_mov_b64 s[48:49], s[4:5]
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
 ; CHECK-NEXT:    s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12
+; CHECK-NEXT:    v_writelane_b32 v42, s50, 8
 ; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT:    v_writelane_b32 v42, s50, 10
-; CHECK-NEXT:    v_writelane_b32 v42, s51, 11
-; CHECK-NEXT:    v_writelane_b32 v42, s52, 12
+; CHECK-NEXT:    v_writelane_b32 v42, s51, 9
+; CHECK-NEXT:    v_writelane_b32 v42, s52, 10
+; CHECK-NEXT:    v_writelane_b32 v42, s53, 11
+; CHECK-NEXT:    v_writelane_b32 v42, s30, 12
 ; CHECK-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v1
 ; CHECK-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
-; CHECK-NEXT:    v_writelane_b32 v42, s53, 13
+; CHECK-NEXT:    v_writelane_b32 v42, s31, 13
 ; CHECK-NEXT:    v_mov_b32_e32 v40, v31
 ; CHECK-NEXT:    s_mov_b32 s50, s15
 ; CHECK-NEXT:    s_mov_b32 s51, s14
@@ -597,20 +597,20 @@ define double @test_pown_fast_f64_known_even(double %x, i32 %y.arg) {
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
 ; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; CHECK-NEXT:    v_readlane_b32 s53, v42, 13
-; CHECK-NEXT:    v_readlane_b32 s52, v42, 12
-; CHECK-NEXT:    v_readlane_b32 s51, v42, 11
-; CHECK-NEXT:    v_readlane_b32 s50, v42, 10
-; CHECK-NEXT:    v_readlane_b32 s49, v42, 9
-; CHECK-NEXT:    v_readlane_b32 s48, v42, 8
-; CHECK-NEXT:    v_readlane_b32 s39, v42, 7
-; CHECK-NEXT:    v_readlane_b32 s38, v42, 6
-; CHECK-NEXT:    v_readlane_b32 s37, v42, 5
-; CHECK-NEXT:    v_readlane_b32 s36, v42, 4
-; CHECK-NEXT:    v_readlane_b32 s35, v42, 3
-; CHECK-NEXT:    v_readlane_b32 s34, v42, 2
-; CHECK-NEXT:    v_readlane_b32 s31, v42, 1
-; CHECK-NEXT:    v_readlane_b32 s30, v42, 0
+; CHECK-NEXT:    v_readlane_b32 s30, v42, 12
+; CHECK-NEXT:    v_readlane_b32 s31, v42, 13
+; CHECK-NEXT:    v_readlane_b32 s53, v42, 11
+; CHECK-NEXT:    v_readlane_b32 s52, v42, 10
+; CHECK-NEXT:    v_readlane_b32 s51, v42, 9
+; CHECK-NEXT:    v_readlane_b32 s50, v42, 8
+; CHECK-NEXT:    v_readlane_b32 s49, v42, 7
+; CHECK-NEXT:    v_readlane_b32 s48, v42, 6
+; CHECK-NEXT:    v_readlane_b32 s39, v42, 5
+; CHECK-NEXT:    v_readlane_b32 s38, v42, 4
+; CHECK-NEXT:    v_readlane_b32 s37, v42, 3
+; CHECK-NEXT:    v_readlane_b32 s36, v42, 2
+; CHECK-NEXT:    v_readlane_b32 s35, v42, 1
+; CHECK-NEXT:    v_readlane_b32 s34, v42, 0
 ; CHECK-NEXT:    s_mov_b32 s32, s33
 ; CHECK-NEXT:    v_readlane_b32 s4, v42, 14
 ; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -684,33 +684,33 @@ define double @test_pown_fast_f64_known_odd(double %x, i32 %y.arg) {
 ; CHECK-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
 ; CHECK-NEXT:    s_mov_b64 exec, s[18:19]
 ; CHECK-NEXT:    v_writelane_b32 v43, s16, 15
-; CHECK-NEXT:    v_writelane_b32 v43, s30, 0
-; CHECK-NEXT:    v_writelane_b32 v43, s31, 1
-; CHECK-NEXT:    v_writelane_b32 v43, s34, 2
-; CHECK-NEXT:    v_writelane_b32 v43, s35, 3
-; CHECK-NEXT:    v_writelane_b32 v43, s36, 4
-; CHECK-NEXT:    v_writelane_b32 v43, s37, 5
-; CHECK-NEXT:    v_writelane_b32 v43, s38, 6
-; CHECK-NEXT:    v_writelane_b32 v43, s39, 7
+; CHECK-NEXT:    v_writelane_b32 v43, s34, 0
+; CHECK-NEXT:    v_writelane_b32 v43, s35, 1
+; CHECK-NEXT:    v_writelane_b32 v43, s36, 2
+; CHECK-NEXT:    v_writelane_b32 v43, s37, 3
+; CHECK-NEXT:    v_writelane_b32 v43, s38, 4
+; CHECK-NEXT:    v_writelane_b32 v43, s39, 5
+; CHECK-NEXT:    v_writelane_b32 v43, s48, 6
 ; CHECK-NEXT:    s_addk_i32 s32, 0x800
-; CHECK-NEXT:    v_writelane_b32 v43, s48, 8
-; CHECK-NEXT:    v_writelane_b32 v43, s49, 9
+; CHECK-NEXT:    v_writelane_b32 v43, s49, 7
+; CHECK-NEXT:    v_writelane_b32 v43, s50, 8
 ; CHECK-NEXT:    s_mov_b64 s[48:49], s[4:5]
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
 ; CHECK-NEXT:    s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12
+; CHECK-NEXT:    v_writelane_b32 v43, s51, 9
 ; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT:    v_writelane_b32 v43, s50, 10
-; CHECK-NEXT:    v_writelane_b32 v43, s51, 11
+; CHECK-NEXT:    v_writelane_b32 v43, s52, 10
+; CHECK-NEXT:    v_writelane_b32 v43, s53, 11
 ; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; CHECK-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
-; CHECK-NEXT:    v_writelane_b32 v43, s52, 12
+; CHECK-NEXT:    v_writelane_b32 v43, s54, 12
 ; CHECK-NEXT:    v_mov_b32_e32 v41, v1
-; CHECK-NEXT:    v_writelane_b32 v43, s53, 13
+; CHECK-NEXT:    v_writelane_b32 v43, s30, 13
 ; CHECK-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v41
 ; CHECK-NEXT:    s_mov_b64 s[4:5], s[48:49]
-; CHECK-NEXT:    v_writelane_b32 v43, s54, 14
+; CHECK-NEXT:    v_writelane_b32 v43, s31, 14
 ; CHECK-NEXT:    v_mov_b32_e32 v40, v31
 ; CHECK-NEXT:    s_mov_b32 s50, s15
 ; CHECK-NEXT:    s_mov_b32 s51, s14
@@ -744,21 +744,21 @@ define double @test_pown_fast_f64_known_odd(double %x, i32 %y.arg) {
 ; CHECK-NEXT:    buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
 ; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; CHECK-NEXT:    v_readlane_b32 s54, v43, 14
-; CHECK-NEXT:    v_readlane_b32 s53, v43, 13
-; CHECK-NEXT:    v_readlane_b32 s52, v43, 12
-; CHECK-NEXT:    v_readlane_b32 s51, v43, 11
-; CHECK-NEXT:    v_readlane_b32 s50, v43, 10
-; CHECK-NEXT:    v_readlane_b32 s49, v43, 9
-; CHECK-NEXT:    v_readlane_b32 s48, v43, 8
-; CHECK-NEXT:    v_readlane_b32 s39, v43, 7
-; CHECK-NEXT:    v_readlane_b32 s38, v43, 6
-; CHECK-NEXT:    v_readlane_b32 s37, v43, 5
-; CHECK-NEXT:    v_readlane_b32 s36, v43, 4
-; CHECK-NEXT:    v_readlane_b32 s35, v43, 3
-; CHECK-NEXT:    v_readlane_b32 s34, v43, 2
-; CHECK-NEXT:    v_readlane_b32 s31, v43, 1
-; CHECK-NEXT:    v_readlane_b32 s30, v43, 0
+; CHECK-NEXT:    v_readlane_b32 s30, v43, 13
+; CHECK-NEXT:    v_readlane_b32 s31, v43, 14
+; CHECK-NEXT:    v_readlane_b32 s54, v43, 12
+; CHECK-NEXT:    v_readlane_b32 s53, v43, 11
+; CHECK-NEXT:    v_readlane_b32 s52, v43, 10
+; CHECK-NEXT:    v_readlane_b32 s51, v43, 9
+; CHECK-NEXT:    v_readlane_b32 s50, v43, 8
+; CHECK-NEXT:    v_readlane_b32 s49, v43, 7
+; CHECK-NEXT:    v_readlane_b32 s48, v43, 6
+; CHECK-NEXT:    v_readlane_b32 s39, v43, 5
+; CHECK-NEXT:    v_readlane_b32 s38, v43, 4
+; CHECK-NEXT:    v_readlane_b32 s37, v43, 3
+; CHECK-NEXT:    v_readlane_b32 s36, v43, 2
+; CHECK-NEXT:    v_readlane_b32 s35, v43, 1
+; CHECK-NEXT:    v_readlane_b32 s34, v43, 0
 ; CHECK-NEXT:    s_mov_b32 s32, s33
 ; CHECK-NEXT:    v_readlane_b32 s4, v43, 15
 ; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -771,3 +771,4 @@ define double @test_pown_fast_f64_known_odd(double %x, i32 %y.arg) {
   %call = tail call fast double @_Z4powndi(double %x, i32 %y)
   ret double %call
 }
+
diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior2.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior2.ll
index 554f40b5bfdfa..7ea641885a1f1 100644
--- a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior2.ll
+++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior2.ll
@@ -214,8 +214,8 @@ define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) #0 {
 ; GFX8-NEXT:    v_writelane_b32 v3, s31, 1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX8-NEXT:    v_readlane_b32 s31, v3, 1
 ; GFX8-NEXT:    v_readlane_b32 s30, v3, 0
+; GFX8-NEXT:    v_readlane_b32 s31, v3, 1
 ; GFX8-NEXT:    s_mov_b32 s32, s33
 ; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX8-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -242,8 +242,8 @@ define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) #0 {
 ; GFX8-ARCH-FLAT-NEXT:    v_writelane_b32 v3, s31, 1
 ; GFX8-ARCH-FLAT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-ARCH-FLAT-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX8-ARCH-FLAT-NEXT:    v_readlane_b32 s31, v3, 1
 ; GFX8-ARCH-FLAT-NEXT:    v_readlane_b32 s30, v3, 0
+; GFX8-ARCH-FLAT-NEXT:    v_readlane_b32 s31, v3, 1
 ; GFX8-ARCH-FLAT-NEXT:    s_mov_b32 s32, s33
 ; GFX8-ARCH-FLAT-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX8-ARCH-FLAT-NEXT:    s_add_i32 s3, s33, 8
@@ -270,8 +270,8 @@ define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) #0 {
 ; GFX9-NEXT:    v_writelane_b32 v3, s31, 1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT:    v_readlane_b32 s31, v3, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v3, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v3, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -297,8 +297,8 @@ define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) #0 {
 ; GFX9-ARCH-FLAT-NEXT:    v_writelane_b32 v3, s31, 1
 ; GFX9-ARCH-FLAT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-ARCH-FLAT-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX9-ARCH-FLAT-NEXT:    v_readlane_b32 s31, v3, 1
 ; GFX9-ARCH-FLAT-NEXT:    v_readlane_b32 s30, v3, 0
+; GFX9-ARCH-FLAT-NEXT:    v_readlane_b32 s31, v3, 1
 ; GFX9-ARCH-FLAT-NEXT:    s_mov_b32 s32, s33
 ; GFX9-ARCH-FLAT-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX9-ARCH-FLAT-NEXT:    scratch_load_dword v3, off, s33 ; 4-byte Folded Reload
@@ -321,11 +321,12 @@ define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) #0 {
 ; GFX942-ARCH-FLAT-NEXT:    s_addc_u32 s1, s1, with_private_to_flat_addrspacecast@gotpcrel32@hi+12
 ; GFX942-ARCH-FLAT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; GFX942-ARCH-FLAT-NEXT:    v_writelane_b32 v3, s30, 0
+; GFX942-ARCH-FLAT-NEXT:    s_nop 1
 ; GFX942-ARCH-FLAT-NEXT:    v_writelane_b32 v3, s31, 1
 ; GFX942-ARCH-FLAT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-ARCH-FLAT-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX942-ARCH-FLAT-NEXT:    v_readlane_b32 s31, v3, 1
 ; GFX942-ARCH-FLAT-NEXT:    v_readlane_b32 s30, v3, 0
+; GFX942-ARCH-FLAT-NEXT:    v_readlane_b32 s31, v3, 1
 ; GFX942-ARCH-FLAT-NEXT:    s_mov_b32 s32, s33
 ; GFX942-ARCH-FLAT-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-ARCH-FLAT-NEXT:    scratch_load_dword v3, off, s33 ; 4-byte Folded Reload
@@ -352,8 +353,8 @@ define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) #0 {
 ; GFX10-NEXT:    v_writelane_b32 v3, s31, 1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX10-NEXT:    v_readlane_b32 s31, v3, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v3, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v3, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
 ; GFX10-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 7662c844a835d..810a57515e715 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -3315,8 +3315,8 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GCN-NEXT:    buffer_store_short v0, v1, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_readlane_b32 s31, v2, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v2, 0
+; GCN-NEXT:    v_readlane_b32 s31, v2, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -3342,10 +3342,10 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
 ; GFX7-NEXT:    v_writelane_b32 v2, s31, 1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX7-NEXT:    v_readlane_b32 s30, v2, 0
 ; GFX7-NEXT:    buffer_store_short v0, v1, s[0:3], 0 offen
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_readlane_b32 s31, v2, 1
-; GFX7-NEXT:    v_readlane_b32 s30, v2, 0
 ; GFX7-NEXT:    s_mov_b32 s32, s33
 ; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX7-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -3371,10 +3371,10 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
 ; GFX8-NEXT:    v_writelane_b32 v2, s31, 1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX8-NEXT:    v_readlane_b32 s30, v2, 0
 ; GFX8-NEXT:    buffer_store_short v0, v1, s[0:3], 0 offen
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_readlane_b32 s31, v2, 1
-; GFX8-NEXT:    v_readlane_b32 s30, v2, 0
 ; GFX8-NEXT:    s_mov_b32 s32, s33
 ; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX8-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -3400,10 +3400,10 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
 ; GFX900-NEXT:    v_writelane_b32 v2, s31, 1
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX900-NEXT:    v_readlane_b32 s30, v2, 0
 ; GFX900-NEXT:    buffer_store_short v0, v1, s[0:3], 0 offen
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    v_readlane_b32 s31, v2, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v2, 0
 ; GFX900-NEXT:    s_mov_b32 s32, s33
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -3426,13 +3426,14 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
 ; GFX950-NEXT:    s_addc_u32 s1, s1, test_arg_store@gotpcrel32@hi+12
 ; GFX950-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; GFX950-NEXT:    v_writelane_b32 v4, s30, 0
+; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_writelane_b32 v4, s31, 1
 ; GFX950-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX950-NEXT:    v_readlane_b32 s30, v4, 0
 ; GFX950-NEXT:    scratch_store_short v1, v0, off sc0 sc1
 ; GFX950-NEXT:    s_waitcnt vmcnt(0)
 ; GFX950-NEXT:    v_readlane_b32 s31, v4, 1
-; GFX950-NEXT:    v_readlane_b32 s30, v4, 0
 ; GFX950-NEXT:    s_mov_b32 s32, s33
 ; GFX950-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX950-NEXT:    scratch_load_dword v4, off, s33 ; 4-byte Folded Reload
@@ -3459,10 +3460,10 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
 ; GFX10-NEXT:    v_writelane_b32 v2, s31, 1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX10-NEXT:    v_readlane_b32 s30, v2, 0
 ; GFX10-NEXT:    buffer_store_short v0, v1, s[0:3], 0 offen
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_readlane_b32 s31, v2, 1
-; GFX10-NEXT:    v_readlane_b32 s30, v2, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
 ; GFX10-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -3489,10 +3490,11 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
 ; GFX11-NEXT:    v_writelane_b32 v2, s31, 1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_readlane_b32 s30, v2, 0
 ; GFX11-NEXT:    scratch_store_b16 v1, v0, off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    v_readlane_b32 s31, v2, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v2, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload
@@ -3519,10 +3521,11 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
 ; GFX1250-NEXT:    v_writelane_b32 v4, s31, 1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_swap_pc_i64 s[30:31], s[0:1]
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_readlane_b32 s30, v4, 0
 ; GFX1250-NEXT:    scratch_store_b16 v1, v0, off scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    v_readlane_b32 s31, v4, 1
-; GFX1250-NEXT:    v_readlane_b32 s30, v4, 0
 ; GFX1250-NEXT:    s_mov_b32 s32, s33
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-NEXT:    s_xor_saveexec_b32 s0, -1
@@ -3559,8 +3562,8 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GCN-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_readlane_b32 s31, v2, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v2, 0
+; GCN-NEXT:    v_readlane_b32 s31, v2, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -3586,10 +3589,10 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX7-NEXT:    v_writelane_b32 v2, s31, 1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX7-NEXT:    v_readlane_b32 s30, v2, 0
 ; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_readlane_b32 s31, v2, 1
-; GFX7-NEXT:    v_readlane_b32 s30, v2, 0
 ; GFX7-NEXT:    s_mov_b32 s32, s33
 ; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX7-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -3615,10 +3618,10 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX8-NEXT:    v_writelane_b32 v2, s31, 1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX8-NEXT:    v_readlane_b32 s30, v2, 0
 ; GFX8-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_readlane_b32 s31, v2, 1
-; GFX8-NEXT:    v_readlane_b32 s30, v2, 0
 ; GFX8-NEXT:    s_mov_b32 s32, s33
 ; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX8-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -3644,10 +3647,10 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX900-NEXT:    v_writelane_b32 v2, s31, 1
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX900-NEXT:    v_readlane_b32 s30, v2, 0
 ; GFX900-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    v_readlane_b32 s31, v2, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v2, 0
 ; GFX900-NEXT:    s_mov_b32 s32, s33
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -3670,13 +3673,14 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX950-NEXT:    s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12
 ; GFX950-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; GFX950-NEXT:    v_writelane_b32 v4, s30, 0
+; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_writelane_b32 v4, s31, 1
 ; GFX950-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX950-NEXT:    v_readlane_b32 s30, v4, 0
 ; GFX950-NEXT:    scratch_store_dword v1, v0, off sc0 sc1
 ; GFX950-NEXT:    s_waitcnt vmcnt(0)
 ; GFX950-NEXT:    v_readlane_b32 s31, v4, 1
-; GFX950-NEXT:    v_readlane_b32 s30, v4, 0
 ; GFX950-NEXT:    s_mov_b32 s32, s33
 ; GFX950-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX950-NEXT:    scratch_load_dword v4, off, s33 ; 4-byte Folded Reload
@@ -3703,10 +3707,10 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX10-NEXT:    v_writelane_b32 v2, s31, 1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX10-NEXT:    v_readlane_b32 s30, v2, 0
 ; GFX10-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_readlane_b32 s31, v2, 1
-; GFX10-NEXT:    v_readlane_b32 s30, v2, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
 ; GFX10-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -3733,10 +3737,11 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX11-NEXT:    v_writelane_b32 v2, s31, 1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_readlane_b32 s30, v2, 0
 ; GFX11-NEXT:    scratch_store_b32 v1, v0, off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    v_readlane_b32 s31, v2, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v2, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload
@@ -3763,10 +3768,11 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX1250-NEXT:    v_writelane_b32 v4, s31, 1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_swap_pc_i64 s[30:31], s[0:1]
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_readlane_b32 s30, v4, 0
 ; GFX1250-NEXT:    scratch_store_b32 v1, v0, off scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    v_readlane_b32 s31, v4, 1
-; GFX1250-NEXT:    v_readlane_b32 s30, v4, 0
 ; GFX1250-NEXT:    s_mov_b32 s32, s33
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-NEXT:    s_xor_saveexec_b32 s0, -1
@@ -3807,8 +3813,8 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_readlane_b32 s31, v4, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v4, 0
+; GCN-NEXT:    v_readlane_b32 s31, v4, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -3836,12 +3842,12 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 4, v2
+; GFX7-NEXT:    v_readlane_b32 s30, v4, 0
 ; GFX7-NEXT:    buffer_store_short v1, v3, s[0:3], 0 offen
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_readlane_b32 s31, v4, 1
-; GFX7-NEXT:    v_readlane_b32 s30, v4, 0
 ; GFX7-NEXT:    s_mov_b32 s32, s33
 ; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX7-NEXT:    buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -3868,12 +3874,12 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 4, v2
+; GFX8-NEXT:    v_readlane_b32 s30, v4, 0
 ; GFX8-NEXT:    buffer_store_short v1, v3, s[0:3], 0 offen
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_readlane_b32 s31, v4, 1
-; GFX8-NEXT:    v_readlane_b32 s30, v4, 0
 ; GFX8-NEXT:    s_mov_b32 s32, s33
 ; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX8-NEXT:    buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -3899,12 +3905,12 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX900-NEXT:    v_writelane_b32 v3, s31, 1
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX900-NEXT:    v_readlane_b32 s30, v3, 0
 ; GFX900-NEXT:    buffer_store_short v1, v2, s[0:3], 0 offen offset:4
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    v_readlane_b32 s31, v3, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v3, 0
 ; GFX900-NEXT:    s_mov_b32 s32, s33
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -3927,16 +3933,17 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX950-NEXT:    s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12
 ; GFX950-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; GFX950-NEXT:    v_writelane_b32 v5, s30, 0
-; GFX950-NEXT:    v_writelane_b32 v5, s31, 1
 ; GFX950-NEXT:    v_mov_b32_e32 v4, v2
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_writelane_b32 v5, s31, 1
 ; GFX950-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX950-NEXT:    v_readlane_b32 s30, v5, 0
 ; GFX950-NEXT:    scratch_store_short v4, v1, off offset:4 sc0 sc1
 ; GFX950-NEXT:    s_waitcnt vmcnt(0)
 ; GFX950-NEXT:    scratch_store_dword v4, v0, off sc0 sc1
 ; GFX950-NEXT:    s_waitcnt vmcnt(0)
 ; GFX950-NEXT:    v_readlane_b32 s31, v5, 1
-; GFX950-NEXT:    v_readlane_b32 s30, v5, 0
 ; GFX950-NEXT:    s_mov_b32 s32, s33
 ; GFX950-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX950-NEXT:    scratch_load_dword v5, off, s33 ; 4-byte Folded Reload
@@ -3963,12 +3970,12 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX10-NEXT:    v_writelane_b32 v3, s31, 1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX10-NEXT:    v_readlane_b32 s30, v3, 0
 ; GFX10-NEXT:    buffer_store_short v1, v2, s[0:3], 0 offen offset:4
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_readlane_b32 s31, v3, 1
-; GFX10-NEXT:    v_readlane_b32 s30, v3, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
 ; GFX10-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -3995,12 +4002,13 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX11-NEXT:    v_writelane_b32 v3, s31, 1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_readlane_b32 s30, v3, 0
 ; GFX11-NEXT:    scratch_store_b16 v2, v1, off offset:4 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    scratch_store_b32 v2, v0, off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    v_readlane_b32 s31, v3, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v3, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_load_b32 v3, off, s33 ; 4-byte Folded Reload
@@ -4028,13 +4036,14 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX1250-NEXT:    v_writelane_b32 v5, s31, 1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_swap_pc_i64 s[30:31], s[0:1]
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_readlane_b32 s30, v5, 0
 ; GFX1250-NEXT:    scratch_store_b16 v4, v1, off offset:4 scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 v4, v0, off scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    v_readlane_b32 s31, v5, 1
-; GFX1250-NEXT:    v_readlane_b32 s30, v5, 0
 ; GFX1250-NEXT:    s_mov_b32 s32, s33
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-NEXT:    s_xor_saveexec_b32 s0, -1
@@ -4082,8 +4091,8 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    buffer_store_short v3, v7, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_readlane_b32 s31, v8, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v8, 0
+; GCN-NEXT:    v_readlane_b32 s31, v8, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -4120,10 +4129,10 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX7-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 2, v2
+; GFX7-NEXT:    v_readlane_b32 s30, v6, 0
 ; GFX7-NEXT:    buffer_store_short v3, v0, s[0:3], 0 offen
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_readlane_b32 s31, v6, 1
-; GFX7-NEXT:    v_readlane_b32 s30, v6, 0
 ; GFX7-NEXT:    s_mov_b32 s32, s33
 ; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX7-NEXT:    buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -4150,12 +4159,12 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 4, v2
+; GFX8-NEXT:    v_readlane_b32 s30, v4, 0
 ; GFX8-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_readlane_b32 s31, v4, 1
-; GFX8-NEXT:    v_readlane_b32 s30, v4, 0
 ; GFX8-NEXT:    s_mov_b32 s32, s33
 ; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX8-NEXT:    buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -4181,12 +4190,12 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX900-NEXT:    v_writelane_b32 v3, s31, 1
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX900-NEXT:    v_readlane_b32 s30, v3, 0
 ; GFX900-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    v_readlane_b32 s31, v3, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v3, 0
 ; GFX900-NEXT:    s_mov_b32 s32, s33
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -4209,14 +4218,15 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX950-NEXT:    s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12
 ; GFX950-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; GFX950-NEXT:    v_writelane_b32 v5, s30, 0
-; GFX950-NEXT:    v_writelane_b32 v5, s31, 1
 ; GFX950-NEXT:    v_mov_b32_e32 v4, v2
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_writelane_b32 v5, s31, 1
 ; GFX950-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX950-NEXT:    v_readlane_b32 s30, v5, 0
 ; GFX950-NEXT:    scratch_store_dwordx2 v4, v[0:1], off sc0 sc1
 ; GFX950-NEXT:    s_waitcnt vmcnt(0)
 ; GFX950-NEXT:    v_readlane_b32 s31, v5, 1
-; GFX950-NEXT:    v_readlane_b32 s30, v5, 0
 ; GFX950-NEXT:    s_mov_b32 s32, s33
 ; GFX950-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX950-NEXT:    scratch_load_dword v5, off, s33 ; 4-byte Folded Reload
@@ -4243,12 +4253,12 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX10-NEXT:    v_writelane_b32 v3, s31, 1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX10-NEXT:    v_readlane_b32 s30, v3, 0
 ; GFX10-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_readlane_b32 s31, v3, 1
-; GFX10-NEXT:    v_readlane_b32 s30, v3, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
 ; GFX10-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -4275,10 +4285,11 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX11-NEXT:    v_writelane_b32 v3, s31, 1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_readlane_b32 s30, v3, 0
 ; GFX11-NEXT:    scratch_store_b64 v2, v[0:1], off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    v_readlane_b32 s31, v3, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v3, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_load_b32 v3, off, s33 ; 4-byte Folded Reload
@@ -4306,10 +4317,11 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX1250-NEXT:    v_writelane_b32 v5, s31, 1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_swap_pc_i64 s[30:31], s[0:1]
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_readlane_b32 s30, v5, 0
 ; GFX1250-NEXT:    scratch_store_b64 v4, v[0:1], off scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    v_readlane_b32 s31, v5, 1
-; GFX1250-NEXT:    v_readlane_b32 s30, v5, 0
 ; GFX1250-NEXT:    s_mov_b32 s32, s33
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-NEXT:    s_xor_saveexec_b32 s0, -1
@@ -4371,8 +4383,8 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    buffer_store_short v5, v15, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_readlane_b32 s31, v16, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v16, 0
+; GCN-NEXT:    v_readlane_b32 s31, v16, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -4423,10 +4435,10 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX7-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 2, v4
+; GFX7-NEXT:    v_readlane_b32 s30, v10, 0
 ; GFX7-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_readlane_b32 s31, v10, 1
-; GFX7-NEXT:    v_readlane_b32 s30, v10, 0
 ; GFX7-NEXT:    s_mov_b32 s32, s33
 ; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX7-NEXT:    buffer_load_dword v10, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -4459,12 +4471,12 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX8-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 4, v4
+; GFX8-NEXT:    v_readlane_b32 s30, v6, 0
 ; GFX8-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_readlane_b32 s31, v6, 1
-; GFX8-NEXT:    v_readlane_b32 s30, v6, 0
 ; GFX8-NEXT:    s_mov_b32 s32, s33
 ; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX8-NEXT:    buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -4490,6 +4502,7 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX900-NEXT:    v_writelane_b32 v5, s31, 1
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX900-NEXT:    v_readlane_b32 s30, v5, 0
 ; GFX900-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen offset:12
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen offset:8
@@ -4499,7 +4512,6 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX900-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    v_readlane_b32 s31, v5, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v5, 0
 ; GFX900-NEXT:    s_mov_b32 s32, s33
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -4522,13 +4534,14 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX950-NEXT:    s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12
 ; GFX950-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; GFX950-NEXT:    v_writelane_b32 v5, s30, 0
+; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_writelane_b32 v5, s31, 1
 ; GFX950-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX950-NEXT:    v_readlane_b32 s30, v5, 0
 ; GFX950-NEXT:    scratch_store_dwordx4 v4, v[0:3], off sc0 sc1
 ; GFX950-NEXT:    s_waitcnt vmcnt(0)
 ; GFX950-NEXT:    v_readlane_b32 s31, v5, 1
-; GFX950-NEXT:    v_readlane_b32 s30, v5, 0
 ; GFX950-NEXT:    s_mov_b32 s32, s33
 ; GFX950-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX950-NEXT:    scratch_load_dword v5, off, s33 ; 4-byte Folded Reload
@@ -4555,6 +4568,7 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX10-NEXT:    v_writelane_b32 v5, s31, 1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX10-NEXT:    v_readlane_b32 s30, v5, 0
 ; GFX10-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen offset:12
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen offset:8
@@ -4564,7 +4578,6 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX10-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_readlane_b32 s31, v5, 1
-; GFX10-NEXT:    v_readlane_b32 s30, v5, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
 ; GFX10-NEXT:    buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -4591,10 +4604,11 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX11-NEXT:    v_writelane_b32 v5, s31, 1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_readlane_b32 s30, v5, 0
 ; GFX11-NEXT:    scratch_store_b128 v4, v[0:3], off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    v_readlane_b32 s31, v5, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v5, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_load_b32 v5, off, s33 ; 4-byte Folded Reload
@@ -4621,10 +4635,11 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX1250-NEXT:    v_writelane_b32 v5, s31, 1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_swap_pc_i64 s[30:31], s[0:1]
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_readlane_b32 s30, v5, 0
 ; GFX1250-NEXT:    scratch_store_b128 v4, v[0:3], off scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    v_readlane_b32 s31, v5, 1
-; GFX1250-NEXT:    v_readlane_b32 s30, v5, 0
 ; GFX1250-NEXT:    s_mov_b32 s32, s33
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-NEXT:    s_xor_saveexec_b32 s0, -1
@@ -4714,8 +4729,8 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    buffer_store_short v9, v6, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_readlane_b32 s31, v20, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v20, 0
+; GCN-NEXT:    v_readlane_b32 s31, v20, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    buffer_load_dword v20, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -4794,10 +4809,10 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX7-NEXT:    buffer_store_short v10, v0, s[0:3], 0 offen
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 2, v8
+; GFX7-NEXT:    v_readlane_b32 s30, v18, 0
 ; GFX7-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_readlane_b32 s31, v18, 1
-; GFX7-NEXT:    v_readlane_b32 s30, v18, 0
 ; GFX7-NEXT:    s_mov_b32 s32, s33
 ; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX7-NEXT:    buffer_load_dword v18, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -4842,12 +4857,12 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX8-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 4, v8
+; GFX8-NEXT:    v_readlane_b32 s30, v10, 0
 ; GFX8-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_store_dword v0, v8, s[0:3], 0 offen
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_readlane_b32 s31, v10, 1
-; GFX8-NEXT:    v_readlane_b32 s30, v10, 0
 ; GFX8-NEXT:    s_mov_b32 s32, s33
 ; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX8-NEXT:    buffer_load_dword v10, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -4873,6 +4888,7 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX900-NEXT:    v_writelane_b32 v9, s31, 1
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX900-NEXT:    v_readlane_b32 s30, v9, 0
 ; GFX900-NEXT:    buffer_store_dword v7, v8, s[0:3], 0 offen offset:28
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    buffer_store_dword v6, v8, s[0:3], 0 offen offset:24
@@ -4890,7 +4906,6 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX900-NEXT:    buffer_store_dword v0, v8, s[0:3], 0 offen
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    v_readlane_b32 s31, v9, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v9, 0
 ; GFX900-NEXT:    s_mov_b32 s32, s33
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -4913,15 +4928,16 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX950-NEXT:    s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12
 ; GFX950-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; GFX950-NEXT:    v_writelane_b32 v9, s30, 0
+; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_writelane_b32 v9, s31, 1
 ; GFX950-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX950-NEXT:    v_readlane_b32 s30, v9, 0
 ; GFX950-NEXT:    scratch_store_dwordx4 v8, v[4:7], off offset:16 sc0 sc1
 ; GFX950-NEXT:    s_waitcnt vmcnt(0)
 ; GFX950-NEXT:    scratch_store_dwordx4 v8, v[0:3], off sc0 sc1
 ; GFX950-NEXT:    s_waitcnt vmcnt(0)
 ; GFX950-NEXT:    v_readlane_b32 s31, v9, 1
-; GFX950-NEXT:    v_readlane_b32 s30, v9, 0
 ; GFX950-NEXT:    s_mov_b32 s32, s33
 ; GFX950-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX950-NEXT:    scratch_load_dword v9, off, s33 ; 4-byte Folded Reload
@@ -4948,6 +4964,7 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX10-NEXT:    v_writelane_b32 v9, s31, 1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX10-NEXT:    v_readlane_b32 s30, v9, 0
 ; GFX10-NEXT:    buffer_store_dword v7, v8, s[0:3], 0 offen offset:28
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_store_dword v6, v8, s[0:3], 0 offen offset:24
@@ -4965,7 +4982,6 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX10-NEXT:    buffer_store_dword v0, v8, s[0:3], 0 offen
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_readlane_b32 s31, v9, 1
-; GFX10-NEXT:    v_readlane_b32 s30, v9, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
 ; GFX10-NEXT:    buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -4992,12 +5008,13 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX11-NEXT:    v_writelane_b32 v9, s31, 1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_readlane_b32 s30, v9, 0
 ; GFX11-NEXT:    scratch_store_b128 v8, v[4:7], off offset:16 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    scratch_store_b128 v8, v[0:3], off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    v_readlane_b32 s31, v9, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v9, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_load_b32 v9, off, s33 ; 4-byte Folded Reload
@@ -5024,13 +5041,14 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX1250-NEXT:    v_writelane_b32 v9, s31, 1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_swap_pc_i64 s[30:31], s[0:1]
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_readlane_b32 s30, v9, 0
 ; GFX1250-NEXT:    scratch_store_b128 v8, v[4:7], off offset:16 scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-NEXT:    scratch_store_b128 v8, v[0:3], off scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    v_readlane_b32 s31, v9, 1
-; GFX1250-NEXT:    v_readlane_b32 s30, v9, 0
 ; GFX1250-NEXT:    s_mov_b32 s32, s33
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-NEXT:    s_xor_saveexec_b32 s0, -1
@@ -28593,822 +28611,6 @@ define bfloat @v_sqrt_bf16(bfloat %a) {
   ret bfloat %op
 }
 
-define bfloat @v_rsq_bf16(bfloat %x) {
-; GCN-LABEL: v_rsq_bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GCN-NEXT:    s_mov_b32 s4, 0xf800000
-; GCN-NEXT:    v_mov_b32_e32 v1, 0x260
-; GCN-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
-; GCN-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GCN-NEXT:    v_sqrt_f32_e32 v2, v0
-; GCN-NEXT:    v_add_i32_e64 v3, s[4:5], -1, v2
-; GCN-NEXT:    v_add_i32_e64 v4, s[4:5], 1, v2
-; GCN-NEXT:    v_fma_f32 v5, -v3, v2, v0
-; GCN-NEXT:    v_fma_f32 v6, -v4, v2, v0
-; GCN-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v5
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[4:5]
-; GCN-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v6
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[4:5]
-; GCN-NEXT:    v_mul_f32_e32 v3, 0x37800000, v2
-; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; GCN-NEXT:    v_cmp_class_f32_e32 vcc, v0, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
-; GCN-NEXT:    v_rcp_f32_e32 v2, v1
-; GCN-NEXT:    v_fma_f32 v3, -v1, v2, 1.0
-; GCN-NEXT:    v_fma_f32 v2, v3, v2, v2
-; GCN-NEXT:    v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
-; GCN-NEXT:    v_mul_f32_e32 v4, v3, v2
-; GCN-NEXT:    v_fma_f32 v5, -v1, v4, v3
-; GCN-NEXT:    v_fma_f32 v4, v5, v2, v4
-; GCN-NEXT:    v_fma_f32 v1, -v1, v4, v3
-; GCN-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
-; GCN-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
-; GCN-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_rsq_bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX7-NEXT:    s_mov_b32 s4, 0xf800000
-; GFX7-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
-; GFX7-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX7-NEXT:    v_sqrt_f32_e32 v1, v0
-; GFX7-NEXT:    v_add_i32_e64 v2, s[4:5], -1, v1
-; GFX7-NEXT:    v_fma_f32 v3, -v2, v1, v0
-; GFX7-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v3
-; GFX7-NEXT:    v_cndmask_b32_e64 v2, v1, v2, s[4:5]
-; GFX7-NEXT:    v_add_i32_e64 v3, s[4:5], 1, v1
-; GFX7-NEXT:    v_fma_f32 v1, -v3, v1, v0
-; GFX7-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v1
-; GFX7-NEXT:    v_cndmask_b32_e64 v1, v2, v3, s[4:5]
-; GFX7-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX7-NEXT:    v_mov_b32_e32 v2, 0x260
-; GFX7-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
-; GFX7-NEXT:    v_rcp_f32_e32 v2, v1
-; GFX7-NEXT:    v_fma_f32 v3, -v1, v2, 1.0
-; GFX7-NEXT:    v_fma_f32 v2, v3, v2, v2
-; GFX7-NEXT:    v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
-; GFX7-NEXT:    v_mul_f32_e32 v4, v3, v2
-; GFX7-NEXT:    v_fma_f32 v5, -v1, v4, v3
-; GFX7-NEXT:    v_fma_f32 v4, v5, v2, v4
-; GFX7-NEXT:    v_fma_f32 v1, -v1, v4, v3
-; GFX7-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
-; GFX7-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_rsq_bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    s_mov_b32 s4, 0xf800000
-; GFX8-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
-; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX8-NEXT:    v_sqrt_f32_e32 v1, v0
-; GFX8-NEXT:    v_add_u32_e64 v2, s[4:5], -1, v1
-; GFX8-NEXT:    v_fma_f32 v3, -v2, v1, v0
-; GFX8-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v3
-; GFX8-NEXT:    v_cndmask_b32_e64 v2, v1, v2, s[4:5]
-; GFX8-NEXT:    v_add_u32_e64 v3, s[4:5], 1, v1
-; GFX8-NEXT:    v_fma_f32 v1, -v3, v1, v0
-; GFX8-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v1
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v2, v3, s[4:5]
-; GFX8-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX8-NEXT:    v_mov_b32_e32 v2, 0x260
-; GFX8-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
-; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s4, v1
-; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX8-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
-; GFX8-NEXT:    v_div_scale_f32 v2, vcc, 1.0, v0, 1.0
-; GFX8-NEXT:    v_rcp_f32_e32 v3, v1
-; GFX8-NEXT:    v_fma_f32 v4, -v1, v3, 1.0
-; GFX8-NEXT:    v_fma_f32 v3, v4, v3, v3
-; GFX8-NEXT:    v_mul_f32_e32 v4, v2, v3
-; GFX8-NEXT:    v_fma_f32 v5, -v1, v4, v2
-; GFX8-NEXT:    v_fma_f32 v4, v5, v3, v4
-; GFX8-NEXT:    v_fma_f32 v1, -v1, v4, v2
-; GFX8-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
-; GFX8-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
-; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
-; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX900-LABEL: v_rsq_bf16:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX900-NEXT:    s_mov_b32 s4, 0xf800000
-; GFX900-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
-; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX900-NEXT:    v_sqrt_f32_e32 v1, v0
-; GFX900-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX900-NEXT:    v_add_u32_e32 v2, -1, v1
-; GFX900-NEXT:    v_fma_f32 v3, -v2, v1, v0
-; GFX900-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v3
-; GFX900-NEXT:    v_add_u32_e32 v3, 1, v1
-; GFX900-NEXT:    v_cndmask_b32_e64 v2, v1, v2, s[4:5]
-; GFX900-NEXT:    v_fma_f32 v1, -v3, v1, v0
-; GFX900-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v1
-; GFX900-NEXT:    v_cndmask_b32_e64 v1, v2, v3, s[4:5]
-; GFX900-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX900-NEXT:    v_mov_b32_e32 v2, 0x260
-; GFX900-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX900-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX900-NEXT:    v_add3_u32 v1, v1, v0, s6
-; GFX900-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX900-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX900-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
-; GFX900-NEXT:    v_div_scale_f32 v2, vcc, 1.0, v0, 1.0
-; GFX900-NEXT:    v_rcp_f32_e32 v3, v1
-; GFX900-NEXT:    v_fma_f32 v4, -v1, v3, 1.0
-; GFX900-NEXT:    v_fma_f32 v3, v4, v3, v3
-; GFX900-NEXT:    v_mul_f32_e32 v4, v2, v3
-; GFX900-NEXT:    v_fma_f32 v5, -v1, v4, v2
-; GFX900-NEXT:    v_fma_f32 v4, v5, v3, v4
-; GFX900-NEXT:    v_fma_f32 v1, -v1, v4, v2
-; GFX900-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
-; GFX900-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
-; GFX900-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX900-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX900-NEXT:    v_add3_u32 v1, v1, v0, s6
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX900-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_rsq_bf16:
-; GFX950:       ; %bb.0:
-; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX950-NEXT:    s_mov_b32 s0, 0xf800000
-; GFX950-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
-; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, s0, v0
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX950-NEXT:    v_sqrt_f32_e32 v1, v0
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_add_u32_e32 v2, -1, v1
-; GFX950-NEXT:    v_fma_f32 v3, -v2, v1, v0
-; GFX950-NEXT:    v_cmp_ge_f32_e64 s[0:1], 0, v3
-; GFX950-NEXT:    v_add_u32_e32 v3, 1, v1
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e64 v2, v1, v2, s[0:1]
-; GFX950-NEXT:    v_fma_f32 v1, -v3, v1, v0
-; GFX950-NEXT:    v_cmp_lt_f32_e64 s[0:1], 0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v1, v2, v3, s[0:1]
-; GFX950-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX950-NEXT:    v_mov_b32_e32 v2, 0x260
-; GFX950-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX950-NEXT:    v_cvt_pk_bf16_f32 v0, v0, s0
-; GFX950-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX950-NEXT:    v_div_scale_f32 v1, s[0:1], v0, v0, 1.0
-; GFX950-NEXT:    v_rcp_f32_e32 v2, v1
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_fma_f32 v3, -v1, v2, 1.0
-; GFX950-NEXT:    v_fmac_f32_e32 v2, v3, v2
-; GFX950-NEXT:    v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
-; GFX950-NEXT:    v_mul_f32_e32 v4, v3, v2
-; GFX950-NEXT:    v_fma_f32 v5, -v1, v4, v3
-; GFX950-NEXT:    v_fmac_f32_e32 v4, v5, v2
-; GFX950-NEXT:    v_fma_f32 v1, -v1, v4, v3
-; GFX950-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
-; GFX950-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
-; GFX950-NEXT:    v_cvt_pk_bf16_f32 v0, v0, s0
-; GFX950-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_rsq_bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0xf800000, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX10-NEXT:    v_sqrt_f32_e32 v1, v0
-; GFX10-NEXT:    v_add_nc_u32_e32 v2, -1, v1
-; GFX10-NEXT:    v_add_nc_u32_e32 v3, 1, v1
-; GFX10-NEXT:    v_fma_f32 v4, -v2, v1, v0
-; GFX10-NEXT:    v_fma_f32 v5, -v3, v1, v0
-; GFX10-NEXT:    v_cmp_ge_f32_e64 s4, 0, v4
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s4
-; GFX10-NEXT:    v_cmp_lt_f32_e64 s4, 0, v5
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s4
-; GFX10-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v0, 0x260
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT:    v_div_scale_f32 v1, s4, v0, v0, 1.0
-; GFX10-NEXT:    v_div_scale_f32 v4, vcc_lo, 1.0, v0, 1.0
-; GFX10-NEXT:    v_rcp_f32_e32 v2, v1
-; GFX10-NEXT:    v_fma_f32 v3, -v1, v2, 1.0
-; GFX10-NEXT:    v_fmac_f32_e32 v2, v3, v2
-; GFX10-NEXT:    v_mul_f32_e32 v3, v4, v2
-; GFX10-NEXT:    v_fma_f32 v5, -v1, v3, v4
-; GFX10-NEXT:    v_fmac_f32_e32 v3, v5, v2
-; GFX10-NEXT:    v_fma_f32 v1, -v1, v3, v4
-; GFX10-NEXT:    v_div_fmas_f32 v1, v1, v2, v3
-; GFX10-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
-; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11TRUE16-LABEL: v_rsq_bf16:
-; GFX11TRUE16:       ; %bb.0:
-; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v1.l, 0
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v1.h, v0.l
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0xf800000, v1
-; GFX11TRUE16-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v1
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT:    v_sqrt_f32_e32 v1, v0
-; GFX11TRUE16-NEXT:    s_waitcnt_depctr depctr_va_vdst(0)
-; GFX11TRUE16-NEXT:    v_add_nc_u32_e32 v2, -1, v1
-; GFX11TRUE16-NEXT:    v_add_nc_u32_e32 v3, 1, v1
-; GFX11TRUE16-NEXT:    v_fma_f32 v4, -v2, v1, v0
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT:    v_fma_f32 v5, -v3, v1, v0
-; GFX11TRUE16-NEXT:    v_cmp_ge_f32_e64 s0, 0, v4
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s0
-; GFX11TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, 0, v5
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s0
-; GFX11TRUE16-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
-; GFX11TRUE16-NEXT:    v_cmp_class_f32_e64 vcc_lo, v0, 0x260
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11TRUE16-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX11TRUE16-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11TRUE16-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT:    v_div_scale_f32 v1, null, v0, v0, 1.0
-; GFX11TRUE16-NEXT:    v_div_scale_f32 v4, vcc_lo, 1.0, v0, 1.0
-; GFX11TRUE16-NEXT:    v_rcp_f32_e32 v2, v1
-; GFX11TRUE16-NEXT:    s_waitcnt_depctr depctr_va_vdst(0)
-; GFX11TRUE16-NEXT:    v_fma_f32 v3, -v1, v2, 1.0
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT:    v_fmac_f32_e32 v2, v3, v2
-; GFX11TRUE16-NEXT:    v_mul_f32_e32 v3, v4, v2
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT:    v_fma_f32 v5, -v1, v3, v4
-; GFX11TRUE16-NEXT:    v_fmac_f32_e32 v3, v5, v2
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT:    v_fma_f32 v1, -v1, v3, v4
-; GFX11TRUE16-NEXT:    v_div_fmas_f32 v1, v1, v2, v3
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
-; GFX11TRUE16-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX11TRUE16-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
-; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11FAKE16-LABEL: v_rsq_bf16:
-; GFX11FAKE16:       ; %bb.0:
-; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11FAKE16-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
-; GFX11FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0xf800000, v0
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11FAKE16-NEXT:    v_sqrt_f32_e32 v1, v0
-; GFX11FAKE16-NEXT:    s_waitcnt_depctr depctr_va_vdst(0)
-; GFX11FAKE16-NEXT:    v_add_nc_u32_e32 v2, -1, v1
-; GFX11FAKE16-NEXT:    v_add_nc_u32_e32 v3, 1, v1
-; GFX11FAKE16-NEXT:    v_fma_f32 v4, -v2, v1, v0
-; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11FAKE16-NEXT:    v_fma_f32 v5, -v3, v1, v0
-; GFX11FAKE16-NEXT:    v_cmp_ge_f32_e64 s0, 0, v4
-; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s0
-; GFX11FAKE16-NEXT:    v_cmp_lt_f32_e64 s0, 0, v5
-; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s0
-; GFX11FAKE16-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
-; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
-; GFX11FAKE16-NEXT:    v_cmp_class_f32_e64 vcc_lo, v0, 0x260
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11FAKE16-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX11FAKE16-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11FAKE16-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX11FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11FAKE16-NEXT:    v_div_scale_f32 v1, null, v0, v0, 1.0
-; GFX11FAKE16-NEXT:    v_div_scale_f32 v4, vcc_lo, 1.0, v0, 1.0
-; GFX11FAKE16-NEXT:    v_rcp_f32_e32 v2, v1
-; GFX11FAKE16-NEXT:    s_waitcnt_depctr depctr_va_vdst(0)
-; GFX11FAKE16-NEXT:    v_fma_f32 v3, -v1, v2, 1.0
-; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11FAKE16-NEXT:    v_fmac_f32_e32 v2, v3, v2
-; GFX11FAKE16-NEXT:    v_mul_f32_e32 v3, v4, v2
-; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11FAKE16-NEXT:    v_fma_f32 v5, -v1, v3, v4
-; GFX11FAKE16-NEXT:    v_fmac_f32_e32 v3, v5, v2
-; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11FAKE16-NEXT:    v_fma_f32 v1, -v1, v3, v4
-; GFX11FAKE16-NEXT:    v_div_fmas_f32 v1, v1, v2, v3
-; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11FAKE16-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
-; GFX11FAKE16-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX11FAKE16-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11FAKE16-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1250TRUE16-LABEL: v_rsq_bf16:
-; GFX1250TRUE16:       ; %bb.0:
-; GFX1250TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX1250TRUE16-NEXT:    s_wait_kmcnt 0x0
-; GFX1250TRUE16-NEXT:    v_rsq_bf16_e32 v0.l, v0.l
-; GFX1250TRUE16-NEXT:    s_set_pc_i64 s[30:31]
-;
-; GFX1250FAKE16-LABEL: v_rsq_bf16:
-; GFX1250FAKE16:       ; %bb.0:
-; GFX1250FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX1250FAKE16-NEXT:    s_wait_kmcnt 0x0
-; GFX1250FAKE16-NEXT:    v_rsq_bf16_e32 v0, v0
-; GFX1250FAKE16-NEXT:    s_set_pc_i64 s[30:31]
-  %sqrt = call contract bfloat @llvm.sqrt.bf16(bfloat %x)
-  %rsq = fdiv contract bfloat 1.0, %sqrt
-  ret bfloat %rsq
-}
-
-define bfloat @v_neg_rsq_bf16(bfloat %x) {
-; GCN-LABEL: v_neg_rsq_bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GCN-NEXT:    s_mov_b32 s4, 0xf800000
-; GCN-NEXT:    v_mov_b32_e32 v1, 0x260
-; GCN-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
-; GCN-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GCN-NEXT:    v_sqrt_f32_e32 v2, v0
-; GCN-NEXT:    v_add_i32_e64 v3, s[4:5], -1, v2
-; GCN-NEXT:    v_add_i32_e64 v4, s[4:5], 1, v2
-; GCN-NEXT:    v_fma_f32 v5, -v3, v2, v0
-; GCN-NEXT:    v_fma_f32 v6, -v4, v2, v0
-; GCN-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v5
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[4:5]
-; GCN-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v6
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[4:5]
-; GCN-NEXT:    v_mul_f32_e32 v3, 0x37800000, v2
-; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; GCN-NEXT:    v_cmp_class_f32_e32 vcc, v0, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
-; GCN-NEXT:    v_rcp_f32_e32 v2, v1
-; GCN-NEXT:    v_fma_f32 v3, -v1, v2, 1.0
-; GCN-NEXT:    v_fma_f32 v2, v3, v2, v2
-; GCN-NEXT:    v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
-; GCN-NEXT:    v_mul_f32_e32 v4, v3, v2
-; GCN-NEXT:    v_fma_f32 v5, -v1, v4, v3
-; GCN-NEXT:    v_fma_f32 v4, v5, v2, v4
-; GCN-NEXT:    v_fma_f32 v1, -v1, v4, v3
-; GCN-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
-; GCN-NEXT:    v_div_fixup_f32 v0, v1, v0, -1.0
-; GCN-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_neg_rsq_bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX7-NEXT:    s_mov_b32 s4, 0xf800000
-; GFX7-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
-; GFX7-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX7-NEXT:    v_sqrt_f32_e32 v1, v0
-; GFX7-NEXT:    v_add_i32_e64 v2, s[4:5], -1, v1
-; GFX7-NEXT:    v_fma_f32 v3, -v2, v1, v0
-; GFX7-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v3
-; GFX7-NEXT:    v_cndmask_b32_e64 v2, v1, v2, s[4:5]
-; GFX7-NEXT:    v_add_i32_e64 v3, s[4:5], 1, v1
-; GFX7-NEXT:    v_fma_f32 v1, -v3, v1, v0
-; GFX7-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v1
-; GFX7-NEXT:    v_cndmask_b32_e64 v1, v2, v3, s[4:5]
-; GFX7-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX7-NEXT:    v_mov_b32_e32 v2, 0x260
-; GFX7-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
-; GFX7-NEXT:    v_rcp_f32_e32 v2, v1
-; GFX7-NEXT:    v_fma_f32 v3, -v1, v2, 1.0
-; GFX7-NEXT:    v_fma_f32 v2, v3, v2, v2
-; GFX7-NEXT:    v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
-; GFX7-NEXT:    v_mul_f32_e32 v4, v3, v2
-; GFX7-NEXT:    v_fma_f32 v5, -v1, v4, v3
-; GFX7-NEXT:    v_fma_f32 v4, v5, v2, v4
-; GFX7-NEXT:    v_fma_f32 v1, -v1, v4, v3
-; GFX7-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
-; GFX7-NEXT:    v_div_fixup_f32 v0, v1, v0, -1.0
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_neg_rsq_bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    s_mov_b32 s4, 0xf800000
-; GFX8-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
-; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX8-NEXT:    v_sqrt_f32_e32 v1, v0
-; GFX8-NEXT:    v_add_u32_e64 v2, s[4:5], -1, v1
-; GFX8-NEXT:    v_fma_f32 v3, -v2, v1, v0
-; GFX8-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v3
-; GFX8-NEXT:    v_cndmask_b32_e64 v2, v1, v2, s[4:5]
-; GFX8-NEXT:    v_add_u32_e64 v3, s[4:5], 1, v1
-; GFX8-NEXT:    v_fma_f32 v1, -v3, v1, v0
-; GFX8-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v1
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v2, v3, s[4:5]
-; GFX8-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX8-NEXT:    v_mov_b32_e32 v2, 0x260
-; GFX8-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
-; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s4, v1
-; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX8-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
-; GFX8-NEXT:    v_div_scale_f32 v2, vcc, -1.0, v0, -1.0
-; GFX8-NEXT:    v_rcp_f32_e32 v3, v1
-; GFX8-NEXT:    v_fma_f32 v4, -v1, v3, 1.0
-; GFX8-NEXT:    v_fma_f32 v3, v4, v3, v3
-; GFX8-NEXT:    v_mul_f32_e32 v4, v2, v3
-; GFX8-NEXT:    v_fma_f32 v5, -v1, v4, v2
-; GFX8-NEXT:    v_fma_f32 v4, v5, v3, v4
-; GFX8-NEXT:    v_fma_f32 v1, -v1, v4, v2
-; GFX8-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
-; GFX8-NEXT:    v_div_fixup_f32 v0, v1, v0, -1.0
-; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
-; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX900-LABEL: v_neg_rsq_bf16:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX900-NEXT:    s_mov_b32 s4, 0xf800000
-; GFX900-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
-; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX900-NEXT:    v_sqrt_f32_e32 v1, v0
-; GFX900-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX900-NEXT:    v_add_u32_e32 v2, -1, v1
-; GFX900-NEXT:    v_fma_f32 v3, -v2, v1, v0
-; GFX900-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v3
-; GFX900-NEXT:    v_add_u32_e32 v3, 1, v1
-; GFX900-NEXT:    v_cndmask_b32_e64 v2, v1, v2, s[4:5]
-; GFX900-NEXT:    v_fma_f32 v1, -v3, v1, v0
-; GFX900-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v1
-; GFX900-NEXT:    v_cndmask_b32_e64 v1, v2, v3, s[4:5]
-; GFX900-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX900-NEXT:    v_mov_b32_e32 v2, 0x260
-; GFX900-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX900-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX900-NEXT:    v_add3_u32 v1, v1, v0, s6
-; GFX900-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX900-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX900-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
-; GFX900-NEXT:    v_div_scale_f32 v2, vcc, -1.0, v0, -1.0
-; GFX900-NEXT:    v_rcp_f32_e32 v3, v1
-; GFX900-NEXT:    v_fma_f32 v4, -v1, v3, 1.0
-; GFX900-NEXT:    v_fma_f32 v3, v4, v3, v3
-; GFX900-NEXT:    v_mul_f32_e32 v4, v2, v3
-; GFX900-NEXT:    v_fma_f32 v5, -v1, v4, v2
-; GFX900-NEXT:    v_fma_f32 v4, v5, v3, v4
-; GFX900-NEXT:    v_fma_f32 v1, -v1, v4, v2
-; GFX900-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
-; GFX900-NEXT:    v_div_fixup_f32 v0, v1, v0, -1.0
-; GFX900-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX900-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX900-NEXT:    v_add3_u32 v1, v1, v0, s6
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX900-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_neg_rsq_bf16:
-; GFX950:       ; %bb.0:
-; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX950-NEXT:    s_mov_b32 s0, 0xf800000
-; GFX950-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
-; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, s0, v0
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX950-NEXT:    v_sqrt_f32_e32 v1, v0
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_add_u32_e32 v2, -1, v1
-; GFX950-NEXT:    v_fma_f32 v3, -v2, v1, v0
-; GFX950-NEXT:    v_cmp_ge_f32_e64 s[0:1], 0, v3
-; GFX950-NEXT:    v_add_u32_e32 v3, 1, v1
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e64 v2, v1, v2, s[0:1]
-; GFX950-NEXT:    v_fma_f32 v1, -v3, v1, v0
-; GFX950-NEXT:    v_cmp_lt_f32_e64 s[0:1], 0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v1, v2, v3, s[0:1]
-; GFX950-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX950-NEXT:    v_mov_b32_e32 v2, 0x260
-; GFX950-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX950-NEXT:    v_cvt_pk_bf16_f32 v0, v0, s0
-; GFX950-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX950-NEXT:    v_div_scale_f32 v1, s[0:1], v0, v0, -1.0
-; GFX950-NEXT:    v_rcp_f32_e32 v2, v1
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_fma_f32 v3, -v1, v2, 1.0
-; GFX950-NEXT:    v_fmac_f32_e32 v2, v3, v2
-; GFX950-NEXT:    v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
-; GFX950-NEXT:    v_mul_f32_e32 v4, v3, v2
-; GFX950-NEXT:    v_fma_f32 v5, -v1, v4, v3
-; GFX950-NEXT:    v_fmac_f32_e32 v4, v5, v2
-; GFX950-NEXT:    v_fma_f32 v1, -v1, v4, v3
-; GFX950-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
-; GFX950-NEXT:    v_div_fixup_f32 v0, v1, v0, -1.0
-; GFX950-NEXT:    v_cvt_pk_bf16_f32 v0, v0, s0
-; GFX950-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_neg_rsq_bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0xf800000, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX10-NEXT:    v_sqrt_f32_e32 v1, v0
-; GFX10-NEXT:    v_add_nc_u32_e32 v2, -1, v1
-; GFX10-NEXT:    v_add_nc_u32_e32 v3, 1, v1
-; GFX10-NEXT:    v_fma_f32 v4, -v2, v1, v0
-; GFX10-NEXT:    v_fma_f32 v5, -v3, v1, v0
-; GFX10-NEXT:    v_cmp_ge_f32_e64 s4, 0, v4
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s4
-; GFX10-NEXT:    v_cmp_lt_f32_e64 s4, 0, v5
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s4
-; GFX10-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v0, 0x260
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT:    v_div_scale_f32 v1, s4, v0, v0, -1.0
-; GFX10-NEXT:    v_div_scale_f32 v4, vcc_lo, -1.0, v0, -1.0
-; GFX10-NEXT:    v_rcp_f32_e32 v2, v1
-; GFX10-NEXT:    v_fma_f32 v3, -v1, v2, 1.0
-; GFX10-NEXT:    v_fmac_f32_e32 v2, v3, v2
-; GFX10-NEXT:    v_mul_f32_e32 v3, v4, v2
-; GFX10-NEXT:    v_fma_f32 v5, -v1, v3, v4
-; GFX10-NEXT:    v_fmac_f32_e32 v3, v5, v2
-; GFX10-NEXT:    v_fma_f32 v1, -v1, v3, v4
-; GFX10-NEXT:    v_div_fmas_f32 v1, v1, v2, v3
-; GFX10-NEXT:    v_div_fixup_f32 v0, v1, v0, -1.0
-; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11TRUE16-LABEL: v_neg_rsq_bf16:
-; GFX11TRUE16:       ; %bb.0:
-; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v1.l, 0
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v1.h, v0.l
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0xf800000, v1
-; GFX11TRUE16-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v1
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT:    v_sqrt_f32_e32 v1, v0
-; GFX11TRUE16-NEXT:    s_waitcnt_depctr depctr_va_vdst(0)
-; GFX11TRUE16-NEXT:    v_add_nc_u32_e32 v2, -1, v1
-; GFX11TRUE16-NEXT:    v_add_nc_u32_e32 v3, 1, v1
-; GFX11TRUE16-NEXT:    v_fma_f32 v4, -v2, v1, v0
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT:    v_fma_f32 v5, -v3, v1, v0
-; GFX11TRUE16-NEXT:    v_cmp_ge_f32_e64 s0, 0, v4
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s0
-; GFX11TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, 0, v5
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s0
-; GFX11TRUE16-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
-; GFX11TRUE16-NEXT:    v_cmp_class_f32_e64 vcc_lo, v0, 0x260
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11TRUE16-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX11TRUE16-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11TRUE16-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT:    v_div_scale_f32 v1, null, v0, v0, -1.0
-; GFX11TRUE16-NEXT:    v_div_scale_f32 v4, vcc_lo, -1.0, v0, -1.0
-; GFX11TRUE16-NEXT:    v_rcp_f32_e32 v2, v1
-; GFX11TRUE16-NEXT:    s_waitcnt_depctr depctr_va_vdst(0)
-; GFX11TRUE16-NEXT:    v_fma_f32 v3, -v1, v2, 1.0
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT:    v_fmac_f32_e32 v2, v3, v2
-; GFX11TRUE16-NEXT:    v_mul_f32_e32 v3, v4, v2
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT:    v_fma_f32 v5, -v1, v3, v4
-; GFX11TRUE16-NEXT:    v_fmac_f32_e32 v3, v5, v2
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT:    v_fma_f32 v1, -v1, v3, v4
-; GFX11TRUE16-NEXT:    v_div_fmas_f32 v1, v1, v2, v3
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT:    v_div_fixup_f32 v0, v1, v0, -1.0
-; GFX11TRUE16-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX11TRUE16-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
-; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11FAKE16-LABEL: v_neg_rsq_bf16:
-; GFX11FAKE16:       ; %bb.0:
-; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11FAKE16-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
-; GFX11FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0xf800000, v0
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11FAKE16-NEXT:    v_sqrt_f32_e32 v1, v0
-; GFX11FAKE16-NEXT:    s_waitcnt_depctr depctr_va_vdst(0)
-; GFX11FAKE16-NEXT:    v_add_nc_u32_e32 v2, -1, v1
-; GFX11FAKE16-NEXT:    v_add_nc_u32_e32 v3, 1, v1
-; GFX11FAKE16-NEXT:    v_fma_f32 v4, -v2, v1, v0
-; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11FAKE16-NEXT:    v_fma_f32 v5, -v3, v1, v0
-; GFX11FAKE16-NEXT:    v_cmp_ge_f32_e64 s0, 0, v4
-; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s0
-; GFX11FAKE16-NEXT:    v_cmp_lt_f32_e64 s0, 0, v5
-; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s0
-; GFX11FAKE16-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
-; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
-; GFX11FAKE16-NEXT:    v_cmp_class_f32_e64 vcc_lo, v0, 0x260
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11FAKE16-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX11FAKE16-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11FAKE16-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX11FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11FAKE16-NEXT:    v_div_scale_f32 v1, null, v0, v0, -1.0
-; GFX11FAKE16-NEXT:    v_div_scale_f32 v4, vcc_lo, -1.0, v0, -1.0
-; GFX11FAKE16-NEXT:    v_rcp_f32_e32 v2, v1
-; GFX11FAKE16-NEXT:    s_waitcnt_depctr depctr_va_vdst(0)
-; GFX11FAKE16-NEXT:    v_fma_f32 v3, -v1, v2, 1.0
-; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11FAKE16-NEXT:    v_fmac_f32_e32 v2, v3, v2
-; GFX11FAKE16-NEXT:    v_mul_f32_e32 v3, v4, v2
-; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11FAKE16-NEXT:    v_fma_f32 v5, -v1, v3, v4
-; GFX11FAKE16-NEXT:    v_fmac_f32_e32 v3, v5, v2
-; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11FAKE16-NEXT:    v_fma_f32 v1, -v1, v3, v4
-; GFX11FAKE16-NEXT:    v_div_fmas_f32 v1, v1, v2, v3
-; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11FAKE16-NEXT:    v_div_fixup_f32 v0, v1, v0, -1.0
-; GFX11FAKE16-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX11FAKE16-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11FAKE16-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1250TRUE16-LABEL: v_neg_rsq_bf16:
-; GFX1250TRUE16:       ; %bb.0:
-; GFX1250TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX1250TRUE16-NEXT:    s_wait_kmcnt 0x0
-; GFX1250TRUE16-NEXT:    v_rsq_bf16_e32 v0.l, v0.l
-; GFX1250TRUE16-NEXT:    v_nop
-; GFX1250TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_1)
-; GFX1250TRUE16-NEXT:    v_xor_b16 v0.l, 0x8000, v0.l
-; GFX1250TRUE16-NEXT:    s_set_pc_i64 s[30:31]
-;
-; GFX1250FAKE16-LABEL: v_neg_rsq_bf16:
-; GFX1250FAKE16:       ; %bb.0:
-; GFX1250FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX1250FAKE16-NEXT:    s_wait_kmcnt 0x0
-; GFX1250FAKE16-NEXT:    v_rsq_bf16_e32 v0, v0
-; GFX1250FAKE16-NEXT:    v_nop
-; GFX1250FAKE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_1)
-; GFX1250FAKE16-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
-; GFX1250FAKE16-NEXT:    s_set_pc_i64 s[30:31]
-  %sqrt = call contract bfloat @llvm.sqrt.bf16(bfloat %x)
-  %rsq = fdiv contract bfloat -1.0, %sqrt
-  ret bfloat %rsq
-}
-
 declare bfloat @llvm.ldexp.bf16.i32(bfloat, i32)
 
 define bfloat @v_ldexp_bf16_i32(bfloat %a, i32 %b) {
@@ -45471,18 +44673,18 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX7-NEXT:    v_and_b32_e32 v0, 1, v26
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[88:89], 1, v0
 ; GFX7-NEXT:    v_and_b32_e32 v0, 1, v27
+; GFX7-NEXT:    v_writelane_b32 v33, s34, 0
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[90:91], 1, v0
 ; GFX7-NEXT:    v_and_b32_e32 v0, 1, v28
+; GFX7-NEXT:    v_writelane_b32 v33, s35, 1
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[92:93], 1, v0
 ; GFX7-NEXT:    v_and_b32_e32 v0, 1, v29
-; GFX7-NEXT:    v_writelane_b32 v33, s30, 0
+; GFX7-NEXT:    v_writelane_b32 v33, s30, 2
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[94:95], 1, v0
 ; GFX7-NEXT:    v_and_b32_e32 v0, 1, v30
-; GFX7-NEXT:    v_writelane_b32 v33, s31, 1
+; GFX7-NEXT:    v_writelane_b32 v33, s31, 3
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[30:31], 1, v0
 ; GFX7-NEXT:    buffer_load_dword v0, off, s[0:3], s32
-; GFX7-NEXT:    v_writelane_b32 v33, s34, 2
-; GFX7-NEXT:    v_writelane_b32 v33, s35, 3
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[34:35], 1, v0
@@ -45552,6 +44754,7 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX7-NEXT:    v_cndmask_b32_e64 v3, v1, v0, s[4:5]
 ; GFX7-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX7-NEXT:    s_mov_b32 s4, 0xffff
+; GFX7-NEXT:    v_readlane_b32 s30, v33, 2
 ; GFX7-NEXT:    v_bfi_b32 v0, s4, v0, v3
 ; GFX7-NEXT:    v_bfi_b32 v1, s4, v2, v5
 ; GFX7-NEXT:    v_bfi_b32 v2, s4, v4, v7
@@ -45568,10 +44771,9 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX7-NEXT:    v_bfi_b32 v13, s4, v26, v29
 ; GFX7-NEXT:    v_bfi_b32 v14, s4, v28, v32
 ; GFX7-NEXT:    v_bfi_b32 v15, s4, v31, v30
-; GFX7-NEXT:    v_readlane_b32 s35, v33, 3
-; GFX7-NEXT:    v_readlane_b32 s34, v33, 2
-; GFX7-NEXT:    v_readlane_b32 s31, v33, 1
-; GFX7-NEXT:    v_readlane_b32 s30, v33, 0
+; GFX7-NEXT:    v_readlane_b32 s31, v33, 3
+; GFX7-NEXT:    v_readlane_b32 s35, v33, 1
+; GFX7-NEXT:    v_readlane_b32 s34, v33, 0
 ; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX7-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
 ; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
@@ -45627,34 +44829,34 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v20
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[60:61], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v21
+; GFX8-NEXT:    v_writelane_b32 v34, s34, 0
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[62:63], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v22
+; GFX8-NEXT:    v_writelane_b32 v34, s35, 1
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[72:73], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v23
+; GFX8-NEXT:    v_writelane_b32 v34, s36, 2
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[74:75], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v24
+; GFX8-NEXT:    v_writelane_b32 v34, s37, 3
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[76:77], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v25
-; GFX8-NEXT:    v_writelane_b32 v34, s30, 0
+; GFX8-NEXT:    v_writelane_b32 v34, s38, 4
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[78:79], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v26
-; GFX8-NEXT:    v_writelane_b32 v34, s31, 1
+; GFX8-NEXT:    v_writelane_b32 v34, s39, 5
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[88:89], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v27
-; GFX8-NEXT:    v_writelane_b32 v34, s34, 2
+; GFX8-NEXT:    v_writelane_b32 v34, s30, 6
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[90:91], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v28
-; GFX8-NEXT:    v_writelane_b32 v34, s35, 3
+; GFX8-NEXT:    v_writelane_b32 v34, s31, 7
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[30:31], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v29
-; GFX8-NEXT:    v_writelane_b32 v34, s36, 4
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[34:35], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v30
-; GFX8-NEXT:    v_writelane_b32 v34, s37, 5
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[36:37], 1, v0
 ; GFX8-NEXT:    buffer_load_ushort v0, off, s[0:3], s32
-; GFX8-NEXT:    v_writelane_b32 v34, s38, 6
-; GFX8-NEXT:    v_writelane_b32 v34, s39, 7
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[38:39], 1, v0
@@ -45780,6 +44982,7 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v28
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v26
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v24
+; GFX8-NEXT:    v_readlane_b32 s30, v34, 6
 ; GFX8-NEXT:    v_or_b32_sdwa v8, v16, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v9, v18, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v10, v20, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -45788,14 +44991,13 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX8-NEXT:    v_or_b32_sdwa v13, v29, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v14, v27, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v15, v25, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_readlane_b32 s39, v34, 7
-; GFX8-NEXT:    v_readlane_b32 s38, v34, 6
-; GFX8-NEXT:    v_readlane_b32 s37, v34, 5
-; GFX8-NEXT:    v_readlane_b32 s36, v34, 4
-; GFX8-NEXT:    v_readlane_b32 s35, v34, 3
-; GFX8-NEXT:    v_readlane_b32 s34, v34, 2
-; GFX8-NEXT:    v_readlane_b32 s31, v34, 1
-; GFX8-NEXT:    v_readlane_b32 s30, v34, 0
+; GFX8-NEXT:    v_readlane_b32 s31, v34, 7
+; GFX8-NEXT:    v_readlane_b32 s39, v34, 5
+; GFX8-NEXT:    v_readlane_b32 s38, v34, 4
+; GFX8-NEXT:    v_readlane_b32 s37, v34, 3
+; GFX8-NEXT:    v_readlane_b32 s36, v34, 2
+; GFX8-NEXT:    v_readlane_b32 s35, v34, 1
+; GFX8-NEXT:    v_readlane_b32 s34, v34, 0
 ; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX8-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
 ; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
@@ -45867,11 +45069,11 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX900-NEXT:    v_and_b32_e32 v0, 1, v28
 ; GFX900-NEXT:    v_cmp_eq_u32_e64 s[94:95], 1, v0
 ; GFX900-NEXT:    buffer_load_ushort v0, off, s[0:3], s32
-; GFX900-NEXT:    v_writelane_b32 v33, s30, 0
-; GFX900-NEXT:    v_writelane_b32 v33, s31, 1
-; GFX900-NEXT:    v_writelane_b32 v33, s34, 2
+; GFX900-NEXT:    v_writelane_b32 v33, s34, 0
+; GFX900-NEXT:    v_writelane_b32 v33, s35, 1
+; GFX900-NEXT:    v_writelane_b32 v33, s30, 2
+; GFX900-NEXT:    v_writelane_b32 v33, s31, 3
 ; GFX900-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX900-NEXT:    v_writelane_b32 v33, s35, 3
 ; GFX900-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    v_and_b32_e32 v0, 1, v0
@@ -45976,6 +45178,7 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX900-NEXT:    v_readlane_b32 s30, v33, 2
 ; GFX900-NEXT:    v_perm_b32 v0, v0, v3, s4
 ; GFX900-NEXT:    v_perm_b32 v1, v2, v5, s4
 ; GFX900-NEXT:    v_perm_b32 v2, v4, v7, s4
@@ -45992,10 +45195,9 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX900-NEXT:    v_perm_b32 v13, v26, v29, s4
 ; GFX900-NEXT:    v_perm_b32 v14, v28, v32, s4
 ; GFX900-NEXT:    v_perm_b32 v15, v31, v30, s4
-; GFX900-NEXT:    v_readlane_b32 s35, v33, 3
-; GFX900-NEXT:    v_readlane_b32 s34, v33, 2
-; GFX900-NEXT:    v_readlane_b32 s31, v33, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v33, 0
+; GFX900-NEXT:    v_readlane_b32 s31, v33, 3
+; GFX900-NEXT:    v_readlane_b32 s35, v33, 1
+; GFX900-NEXT:    v_readlane_b32 s34, v33, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll b/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll
index ab2ad19d0f1bf..fb11d3b7d9d65 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll
@@ -902,47 +902,47 @@ define void @spill_func(ptr addrspace(1) %arg) #0 {
 ; CHECK-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; CHECK-NEXT:    s_mov_b64 exec, s[4:5]
 ; CHECK-NEXT:    s_waitcnt expcnt(0)
-; CHECK-NEXT:    v_writelane_b32 v0, s30, 0
-; CHECK-NEXT:    v_writelane_b32 v0, s31, 1
-; CHECK-NEXT:    v_writelane_b32 v0, s33, 2
-; CHECK-NEXT:    v_writelane_b32 v0, s34, 3
-; CHECK-NEXT:    v_writelane_b32 v0, s35, 4
-; CHECK-NEXT:    v_writelane_b32 v0, s36, 5
-; CHECK-NEXT:    v_writelane_b32 v0, s37, 6
-; CHECK-NEXT:    v_writelane_b32 v0, s38, 7
-; CHECK-NEXT:    v_writelane_b32 v0, s39, 8
-; CHECK-NEXT:    v_writelane_b32 v0, s48, 9
-; CHECK-NEXT:    v_writelane_b32 v0, s49, 10
-; CHECK-NEXT:    v_writelane_b32 v0, s50, 11
-; CHECK-NEXT:    v_writelane_b32 v0, s51, 12
-; CHECK-NEXT:    v_writelane_b32 v0, s52, 13
-; CHECK-NEXT:    v_writelane_b32 v0, s53, 14
-; CHECK-NEXT:    v_writelane_b32 v0, s54, 15
-; CHECK-NEXT:    v_writelane_b32 v0, s55, 16
-; CHECK-NEXT:    v_writelane_b32 v0, s64, 17
-; CHECK-NEXT:    v_writelane_b32 v0, s65, 18
-; CHECK-NEXT:    v_writelane_b32 v0, s66, 19
-; CHECK-NEXT:    v_writelane_b32 v0, s67, 20
-; CHECK-NEXT:    v_writelane_b32 v0, s68, 21
-; CHECK-NEXT:    v_writelane_b32 v0, s69, 22
-; CHECK-NEXT:    v_writelane_b32 v0, s70, 23
-; CHECK-NEXT:    v_writelane_b32 v0, s71, 24
-; CHECK-NEXT:    v_writelane_b32 v0, s80, 25
-; CHECK-NEXT:    v_writelane_b32 v0, s81, 26
-; CHECK-NEXT:    v_writelane_b32 v0, s82, 27
-; CHECK-NEXT:    v_writelane_b32 v0, s83, 28
-; CHECK-NEXT:    v_writelane_b32 v0, s84, 29
-; CHECK-NEXT:    v_writelane_b32 v0, s85, 30
-; CHECK-NEXT:    v_writelane_b32 v0, s86, 31
-; CHECK-NEXT:    v_writelane_b32 v0, s87, 32
-; CHECK-NEXT:    v_writelane_b32 v0, s96, 33
-; CHECK-NEXT:    v_writelane_b32 v0, s97, 34
-; CHECK-NEXT:    v_writelane_b32 v0, s98, 35
-; CHECK-NEXT:    v_writelane_b32 v0, s99, 36
+; CHECK-NEXT:    v_writelane_b32 v0, s33, 0
+; CHECK-NEXT:    v_writelane_b32 v0, s34, 1
+; CHECK-NEXT:    v_writelane_b32 v0, s35, 2
+; CHECK-NEXT:    v_writelane_b32 v0, s36, 3
+; CHECK-NEXT:    v_writelane_b32 v0, s37, 4
+; CHECK-NEXT:    v_writelane_b32 v0, s38, 5
+; CHECK-NEXT:    v_writelane_b32 v0, s39, 6
+; CHECK-NEXT:    v_writelane_b32 v0, s48, 7
+; CHECK-NEXT:    v_writelane_b32 v0, s49, 8
+; CHECK-NEXT:    v_writelane_b32 v0, s50, 9
+; CHECK-NEXT:    v_writelane_b32 v0, s51, 10
+; CHECK-NEXT:    v_writelane_b32 v0, s52, 11
+; CHECK-NEXT:    v_writelane_b32 v0, s53, 12
+; CHECK-NEXT:    v_writelane_b32 v0, s54, 13
+; CHECK-NEXT:    v_writelane_b32 v0, s55, 14
+; CHECK-NEXT:    v_writelane_b32 v0, s64, 15
+; CHECK-NEXT:    v_writelane_b32 v0, s65, 16
+; CHECK-NEXT:    v_writelane_b32 v0, s66, 17
+; CHECK-NEXT:    v_writelane_b32 v0, s67, 18
+; CHECK-NEXT:    v_writelane_b32 v0, s68, 19
+; CHECK-NEXT:    v_writelane_b32 v0, s69, 20
+; CHECK-NEXT:    v_writelane_b32 v0, s70, 21
+; CHECK-NEXT:    v_writelane_b32 v0, s71, 22
+; CHECK-NEXT:    v_writelane_b32 v0, s80, 23
+; CHECK-NEXT:    v_writelane_b32 v0, s81, 24
+; CHECK-NEXT:    v_writelane_b32 v0, s82, 25
+; CHECK-NEXT:    v_writelane_b32 v0, s83, 26
+; CHECK-NEXT:    v_writelane_b32 v0, s84, 27
+; CHECK-NEXT:    v_writelane_b32 v0, s85, 28
+; CHECK-NEXT:    v_writelane_b32 v0, s86, 29
+; CHECK-NEXT:    v_writelane_b32 v0, s87, 30
+; CHECK-NEXT:    v_writelane_b32 v0, s96, 31
+; CHECK-NEXT:    v_writelane_b32 v0, s97, 32
+; CHECK-NEXT:    v_writelane_b32 v0, s98, 33
+; CHECK-NEXT:    v_writelane_b32 v0, s99, 34
+; CHECK-NEXT:    v_writelane_b32 v0, s100, 35
+; CHECK-NEXT:    v_writelane_b32 v0, s101, 36
 ; CHECK-NEXT:    s_mov_b32 s40, s12
-; CHECK-NEXT:    v_writelane_b32 v0, s100, 37
+; CHECK-NEXT:    v_writelane_b32 v0, s30, 37
 ; CHECK-NEXT:    s_cmp_eq_u32 s40, 0
-; CHECK-NEXT:    v_writelane_b32 v0, s101, 38
+; CHECK-NEXT:    v_writelane_b32 v0, s31, 38
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    s_mov_b32 s0, 0
 ; CHECK-NEXT:    ;;#ASMEND
@@ -1380,6 +1380,7 @@ define void @spill_func(ptr addrspace(1) %arg) #0 {
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; reg use s31
 ; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    v_readlane_b32 s30, v0, 37
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; reg use s32
 ; CHECK-NEXT:    ;;#ASMEND
@@ -1596,45 +1597,44 @@ define void @spill_func(ptr addrspace(1) %arg) #0 {
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; reg use vcc_hi
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s101, v0, 38
-; CHECK-NEXT:    v_readlane_b32 s100, v0, 37
-; CHECK-NEXT:    v_readlane_b32 s99, v0, 36
-; CHECK-NEXT:    v_readlane_b32 s98, v0, 35
-; CHECK-NEXT:    v_readlane_b32 s97, v0, 34
-; CHECK-NEXT:    v_readlane_b32 s96, v0, 33
-; CHECK-NEXT:    v_readlane_b32 s87, v0, 32
-; CHECK-NEXT:    v_readlane_b32 s86, v0, 31
-; CHECK-NEXT:    v_readlane_b32 s85, v0, 30
-; CHECK-NEXT:    v_readlane_b32 s84, v0, 29
-; CHECK-NEXT:    v_readlane_b32 s83, v0, 28
-; CHECK-NEXT:    v_readlane_b32 s82, v0, 27
-; CHECK-NEXT:    v_readlane_b32 s81, v0, 26
-; CHECK-NEXT:    v_readlane_b32 s80, v0, 25
-; CHECK-NEXT:    v_readlane_b32 s71, v0, 24
-; CHECK-NEXT:    v_readlane_b32 s70, v0, 23
-; CHECK-NEXT:    v_readlane_b32 s69, v0, 22
-; CHECK-NEXT:    v_readlane_b32 s68, v0, 21
-; CHECK-NEXT:    v_readlane_b32 s67, v0, 20
-; CHECK-NEXT:    v_readlane_b32 s66, v0, 19
-; CHECK-NEXT:    v_readlane_b32 s65, v0, 18
-; CHECK-NEXT:    v_readlane_b32 s64, v0, 17
-; CHECK-NEXT:    v_readlane_b32 s55, v0, 16
-; CHECK-NEXT:    v_readlane_b32 s54, v0, 15
-; CHECK-NEXT:    v_readlane_b32 s53, v0, 14
-; CHECK-NEXT:    v_readlane_b32 s52, v0, 13
-; CHECK-NEXT:    v_readlane_b32 s51, v0, 12
-; CHECK-NEXT:    v_readlane_b32 s50, v0, 11
-; CHECK-NEXT:    v_readlane_b32 s49, v0, 10
-; CHECK-NEXT:    v_readlane_b32 s48, v0, 9
-; CHECK-NEXT:    v_readlane_b32 s39, v0, 8
-; CHECK-NEXT:    v_readlane_b32 s38, v0, 7
-; CHECK-NEXT:    v_readlane_b32 s37, v0, 6
-; CHECK-NEXT:    v_readlane_b32 s36, v0, 5
-; CHECK-NEXT:    v_readlane_b32 s35, v0, 4
-; CHECK-NEXT:    v_readlane_b32 s34, v0, 3
-; CHECK-NEXT:    v_readlane_b32 s33, v0, 2
-; CHECK-NEXT:    v_readlane_b32 s31, v0, 1
-; CHECK-NEXT:    v_readlane_b32 s30, v0, 0
+; CHECK-NEXT:    v_readlane_b32 s31, v0, 38
+; CHECK-NEXT:    v_readlane_b32 s101, v0, 36
+; CHECK-NEXT:    v_readlane_b32 s100, v0, 35
+; CHECK-NEXT:    v_readlane_b32 s99, v0, 34
+; CHECK-NEXT:    v_readlane_b32 s98, v0, 33
+; CHECK-NEXT:    v_readlane_b32 s97, v0, 32
+; CHECK-NEXT:    v_readlane_b32 s96, v0, 31
+; CHECK-NEXT:    v_readlane_b32 s87, v0, 30
+; CHECK-NEXT:    v_readlane_b32 s86, v0, 29
+; CHECK-NEXT:    v_readlane_b32 s85, v0, 28
+; CHECK-NEXT:    v_readlane_b32 s84, v0, 27
+; CHECK-NEXT:    v_readlane_b32 s83, v0, 26
+; CHECK-NEXT:    v_readlane_b32 s82, v0, 25
+; CHECK-NEXT:    v_readlane_b32 s81, v0, 24
+; CHECK-NEXT:    v_readlane_b32 s80, v0, 23
+; CHECK-NEXT:    v_readlane_b32 s71, v0, 22
+; CHECK-NEXT:    v_readlane_b32 s70, v0, 21
+; CHECK-NEXT:    v_readlane_b32 s69, v0, 20
+; CHECK-NEXT:    v_readlane_b32 s68, v0, 19
+; CHECK-NEXT:    v_readlane_b32 s67, v0, 18
+; CHECK-NEXT:    v_readlane_b32 s66, v0, 17
+; CHECK-NEXT:    v_readlane_b32 s65, v0, 16
+; CHECK-NEXT:    v_readlane_b32 s64, v0, 15
+; CHECK-NEXT:    v_readlane_b32 s55, v0, 14
+; CHECK-NEXT:    v_readlane_b32 s54, v0, 13
+; CHECK-NEXT:    v_readlane_b32 s53, v0, 12
+; CHECK-NEXT:    v_readlane_b32 s52, v0, 11
+; CHECK-NEXT:    v_readlane_b32 s51, v0, 10
+; CHECK-NEXT:    v_readlane_b32 s50, v0, 9
+; CHECK-NEXT:    v_readlane_b32 s49, v0, 8
+; CHECK-NEXT:    v_readlane_b32 s48, v0, 7
+; CHECK-NEXT:    v_readlane_b32 s39, v0, 6
+; CHECK-NEXT:    v_readlane_b32 s38, v0, 5
+; CHECK-NEXT:    v_readlane_b32 s37, v0, 4
+; CHECK-NEXT:    v_readlane_b32 s36, v0, 3
+; CHECK-NEXT:    v_readlane_b32 s35, v0, 2
+; CHECK-NEXT:    v_readlane_b32 s34, v0, 1
+; CHECK-NEXT:    v_readlane_b32 s33, v0, 0
 ; CHECK-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; CHECK-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; CHECK-NEXT:    s_mov_b64 exec, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/call-args-inreg-bfloat.ll b/llvm/test/CodeGen/AMDGPU/call-args-inreg-bfloat.ll
index 6ccdbdea135bc..ec54a359ae7c2 100644
--- a/llvm/test/CodeGen/AMDGPU/call-args-inreg-bfloat.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-args-inreg-bfloat.ll
@@ -25,8 +25,8 @@ define void @test_call_external_void_func_bf16_inreg(bfloat inreg %arg) #0 {
 ; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_bf16_inreg@rel32@hi+12
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -53,8 +53,8 @@ define void @test_call_external_void_func_bf16_inreg(bfloat inreg %arg) #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -84,8 +84,8 @@ define void @test_call_external_void_func_v2bf16_inreg(<2 x bfloat> inreg %arg)
 ; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_v2bf16_inreg@rel32@hi+12
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -112,8 +112,8 @@ define void @test_call_external_void_func_v2bf16_inreg(<2 x bfloat> inreg %arg)
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
diff --git a/llvm/test/CodeGen/AMDGPU/call-args-inreg-no-sgpr-for-csrspill.ll b/llvm/test/CodeGen/AMDGPU/call-args-inreg-no-sgpr-for-csrspill.ll
index 36225edabecae..d5b899727441c 100644
--- a/llvm/test/CodeGen/AMDGPU/call-args-inreg-no-sgpr-for-csrspill.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-args-inreg-no-sgpr-for-csrspill.ll
@@ -24,8 +24,8 @@ define void @test_call_external_void_func_a15i32_inreg([15 x i32] inreg %arg0) #
 ; CHECK-NEXT:    v_mov_b32_e32 v0, s42
 ; CHECK-NEXT:    v_writelane_b32 v40, s31, 1
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[40:41]
-; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
 ; CHECK-NEXT:    v_readlane_b32 s30, v40, 0
+; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
 ; CHECK-NEXT:    s_mov_b32 s32, s33
 ; CHECK-NEXT:    v_readlane_b32 s4, v40, 2
 ; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -59,8 +59,8 @@ define void @test_call_external_void_func_a16i32_inreg([16 x i32] inreg %arg0) #
 ; CHECK-NEXT:    v_mov_b32_e32 v1, s43
 ; CHECK-NEXT:    v_writelane_b32 v40, s31, 1
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[40:41]
-; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
 ; CHECK-NEXT:    v_readlane_b32 s30, v40, 0
+; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
 ; CHECK-NEXT:    s_mov_b32 s32, s33
 ; CHECK-NEXT:    v_readlane_b32 s4, v40, 2
 ; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -94,8 +94,8 @@ define void @test_call_external_void_func_a15i32_inreg_i32_inreg([15 x i32] inre
 ; CHECK-NEXT:    v_mov_b32_e32 v1, s43
 ; CHECK-NEXT:    v_writelane_b32 v40, s31, 1
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[40:41]
-; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
 ; CHECK-NEXT:    v_readlane_b32 s30, v40, 0
+; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
 ; CHECK-NEXT:    s_mov_b32 s32, s33
 ; CHECK-NEXT:    v_readlane_b32 s4, v40, 2
 ; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll b/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll
index c1b3278144d59..5f59d780c062d 100644
--- a/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll
@@ -48,8 +48,8 @@ define void @test_call_external_void_func_i8_inreg(i8 inreg %arg) #0 {
 ; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_i8_inreg@rel32@hi+12
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -76,8 +76,8 @@ define void @test_call_external_void_func_i8_inreg(i8 inreg %arg) #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -107,8 +107,8 @@ define void @test_call_external_void_func_i16_inreg(i16 inreg %arg) #0 {
 ; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_i16_inreg@rel32@hi+12
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -135,8 +135,8 @@ define void @test_call_external_void_func_i16_inreg(i16 inreg %arg) #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -166,8 +166,8 @@ define void @test_call_external_void_func_i32_inreg(i32 inreg %arg) #0 {
 ; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_i32_inreg@rel32@hi+12
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -194,8 +194,8 @@ define void @test_call_external_void_func_i32_inreg(i32 inreg %arg) #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -225,8 +225,8 @@ define void @test_call_external_void_func_i64_inreg(i64 inreg %arg) #0 {
 ; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_i64_inreg@rel32@hi+12
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -253,8 +253,8 @@ define void @test_call_external_void_func_i64_inreg(i64 inreg %arg) #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -284,8 +284,8 @@ define void @test_call_external_void_func_v2i32_inreg(<2 x i32> inreg %arg) #0 {
 ; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_v2i32_inreg@rel32@hi+12
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -312,8 +312,8 @@ define void @test_call_external_void_func_v2i32_inreg(<2 x i32> inreg %arg) #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -343,8 +343,8 @@ define void @test_call_external_void_func_v3i32_inreg(<3 x i32> inreg %arg) #0 {
 ; GFX9-NEXT:    s_addc_u32 s21, s21, external_void_func_v3i32_inreg@rel32@hi+12
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[20:21]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -371,8 +371,8 @@ define void @test_call_external_void_func_v3i32_inreg(<3 x i32> inreg %arg) #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -402,8 +402,8 @@ define void @test_call_external_void_func_v4i32_inreg(<4 x i32> inreg %arg) #0 {
 ; GFX9-NEXT:    s_addc_u32 s21, s21, external_void_func_v4i32_inreg@rel32@hi+12
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[20:21]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -430,8 +430,8 @@ define void @test_call_external_void_func_v4i32_inreg(<4 x i32> inreg %arg) #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -461,8 +461,8 @@ define void @test_call_external_void_func_v8i32_inreg(<8 x i32> inreg %arg) #0 {
 ; GFX9-NEXT:    s_addc_u32 s25, s25, external_void_func_v8i32_inreg@rel32@hi+12
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[24:25]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -489,8 +489,8 @@ define void @test_call_external_void_func_v8i32_inreg(<8 x i32> inreg %arg) #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[20:21]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -520,8 +520,8 @@ define void @test_call_external_void_func_f16_inreg(half inreg %arg) #0 {
 ; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_f16_inreg@rel32@hi+12
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -548,8 +548,8 @@ define void @test_call_external_void_func_f16_inreg(half inreg %arg) #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -579,8 +579,8 @@ define void @test_call_external_void_func_f32_inreg(float inreg %arg) #0 {
 ; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_f32_inreg@rel32@hi+12
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -607,8 +607,8 @@ define void @test_call_external_void_func_f32_inreg(float inreg %arg) #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -638,8 +638,8 @@ define void @test_call_external_void_func_f64_inreg(double inreg %arg) #0 {
 ; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_f64_inreg@rel32@hi+12
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -666,8 +666,8 @@ define void @test_call_external_void_func_f64_inreg(double inreg %arg) #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -697,8 +697,8 @@ define void @test_call_external_void_func_v2f16_inreg(<2 x half> inreg %arg) #0
 ; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_v2f16_inreg@rel32@hi+12
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -725,8 +725,8 @@ define void @test_call_external_void_func_v2f16_inreg(<2 x half> inreg %arg) #0
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -756,8 +756,8 @@ define void @test_call_external_void_func_v3f16_inreg(<3 x half> inreg %arg) #0
 ; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_v3f16_inreg@rel32@hi+12
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -784,8 +784,8 @@ define void @test_call_external_void_func_v3f16_inreg(<3 x half> inreg %arg) #0
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -815,8 +815,8 @@ define void @test_call_external_void_func_v4f16_inreg(<4 x half> inreg %arg) #0
 ; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_v4f16_inreg@rel32@hi+12
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -843,8 +843,8 @@ define void @test_call_external_void_func_v4f16_inreg(<4 x half> inreg %arg) #0
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -874,8 +874,8 @@ define void @test_call_external_void_func_p0_inreg(ptr inreg %arg) #0 {
 ; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_p0_inreg@rel32@hi+12
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -902,8 +902,8 @@ define void @test_call_external_void_func_p0_inreg(ptr inreg %arg) #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -933,8 +933,8 @@ define void @test_call_external_void_func_p1_inreg(ptr addrspace(1) inreg %arg)
 ; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_p1_inreg@rel32@hi+12
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -961,8 +961,8 @@ define void @test_call_external_void_func_p1_inreg(ptr addrspace(1) inreg %arg)
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -992,8 +992,8 @@ define void @test_call_external_void_func_p3_inreg(ptr addrspace(3) inreg %arg)
 ; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_p3_inreg@rel32@hi+12
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -1020,8 +1020,8 @@ define void @test_call_external_void_func_p3_inreg(ptr addrspace(3) inreg %arg)
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -1051,8 +1051,8 @@ define void @test_call_external_void_func_v2p1_inreg(<2 x ptr addrspace(1)> inre
 ; GFX9-NEXT:    s_addc_u32 s21, s21, external_void_func_v2p1_inreg@rel32@hi+12
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[20:21]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -1079,8 +1079,8 @@ define void @test_call_external_void_func_v2p1_inreg(<2 x ptr addrspace(1)> inre
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -1110,8 +1110,8 @@ define void @test_call_external_void_func_v2p5_inreg(<2 x ptr addrspace(5)> inre
 ; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_v2p5_inreg@rel32@hi+12
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -1138,8 +1138,8 @@ define void @test_call_external_void_func_v2p5_inreg(<2 x ptr addrspace(5)> inre
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -1169,8 +1169,8 @@ define void @test_call_external_void_func_i64_inreg_i32_inreg_i64_inreg(i64 inre
 ; GFX9-NEXT:    s_addc_u32 s23, s23, external_void_func_i64_inreg_i32_inreg_i64_inreg@rel32@hi+12
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[22:23]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -1197,8 +1197,8 @@ define void @test_call_external_void_func_i64_inreg_i32_inreg_i64_inreg(i64 inre
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -1228,8 +1228,8 @@ define void @test_call_external_void_func_a15i32_inreg([13 x i32] inreg %arg0) #
 ; GFX9-NEXT:    s_addc_u32 s41, s41, external_void_func_a15i32_inreg@rel32@hi+12
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[40:41]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -1256,8 +1256,8 @@ define void @test_call_external_void_func_a15i32_inreg([13 x i32] inreg %arg0) #
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[26:27]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -1289,8 +1289,8 @@ define void @test_call_external_void_func_a15i32_inreg_i32_inreg([13 x i32] inre
 ; GFX9-NEXT:    s_addc_u32 s23, s23, external_void_func_a15i32_inreg_i32_inreg__noimplicit@rel32@hi+12
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[22:23]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -1317,8 +1317,8 @@ define void @test_call_external_void_func_a15i32_inreg_i32_inreg([13 x i32] inre
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
index 293e24f2d1b9d..60bad0d70ec24 100644
--- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -7175,8 +7175,8 @@ define void @stack_12xv3i32() #0 {
 ; VI-NEXT:    v_mov_b32_e32 v30, 10
 ; VI-NEXT:    v_writelane_b32 v40, s31, 1
 ; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; VI-NEXT:    v_readlane_b32 s31, v40, 1
 ; VI-NEXT:    v_readlane_b32 s30, v40, 0
+; VI-NEXT:    v_readlane_b32 s31, v40, 1
 ; VI-NEXT:    s_mov_b32 s32, s33
 ; VI-NEXT:    v_readlane_b32 s4, v40, 2
 ; VI-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -7243,8 +7243,8 @@ define void @stack_12xv3i32() #0 {
 ; CI-NEXT:    v_mov_b32_e32 v30, 10
 ; CI-NEXT:    v_writelane_b32 v40, s31, 1
 ; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; CI-NEXT:    v_readlane_b32 s31, v40, 1
 ; CI-NEXT:    v_readlane_b32 s30, v40, 0
+; CI-NEXT:    v_readlane_b32 s31, v40, 1
 ; CI-NEXT:    s_mov_b32 s32, s33
 ; CI-NEXT:    v_readlane_b32 s4, v40, 2
 ; CI-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -7311,8 +7311,8 @@ define void @stack_12xv3i32() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v30, 10
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -7361,8 +7361,8 @@ define void @stack_12xv3i32() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -7429,8 +7429,8 @@ define void @stack_12xv3i32() #0 {
 ; HSA-NEXT:    v_mov_b32_e32 v30, 10
 ; HSA-NEXT:    v_writelane_b32 v40, s31, 1
 ; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; HSA-NEXT:    v_readlane_b32 s31, v40, 1
 ; HSA-NEXT:    v_readlane_b32 s30, v40, 0
+; HSA-NEXT:    v_readlane_b32 s31, v40, 1
 ; HSA-NEXT:    s_mov_b32 s32, s33
 ; HSA-NEXT:    v_readlane_b32 s4, v40, 2
 ; HSA-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -7514,8 +7514,8 @@ define void @stack_12xv3f32() #0 {
 ; VI-NEXT:    v_mov_b32_e32 v30, 0x41200000
 ; VI-NEXT:    v_writelane_b32 v40, s31, 1
 ; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; VI-NEXT:    v_readlane_b32 s31, v40, 1
 ; VI-NEXT:    v_readlane_b32 s30, v40, 0
+; VI-NEXT:    v_readlane_b32 s31, v40, 1
 ; VI-NEXT:    s_mov_b32 s32, s33
 ; VI-NEXT:    v_readlane_b32 s4, v40, 2
 ; VI-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -7582,8 +7582,8 @@ define void @stack_12xv3f32() #0 {
 ; CI-NEXT:    v_mov_b32_e32 v30, 0x41200000
 ; CI-NEXT:    v_writelane_b32 v40, s31, 1
 ; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; CI-NEXT:    v_readlane_b32 s31, v40, 1
 ; CI-NEXT:    v_readlane_b32 s30, v40, 0
+; CI-NEXT:    v_readlane_b32 s31, v40, 1
 ; CI-NEXT:    s_mov_b32 s32, s33
 ; CI-NEXT:    v_readlane_b32 s4, v40, 2
 ; CI-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -7650,8 +7650,8 @@ define void @stack_12xv3f32() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v30, 0x41200000
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -7704,8 +7704,8 @@ define void @stack_12xv3f32() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -7772,8 +7772,8 @@ define void @stack_12xv3f32() #0 {
 ; HSA-NEXT:    v_mov_b32_e32 v30, 0x41200000
 ; HSA-NEXT:    v_writelane_b32 v40, s31, 1
 ; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; HSA-NEXT:    v_readlane_b32 s31, v40, 1
 ; HSA-NEXT:    v_readlane_b32 s30, v40, 0
+; HSA-NEXT:    v_readlane_b32 s31, v40, 1
 ; HSA-NEXT:    s_mov_b32 s32, s33
 ; HSA-NEXT:    v_readlane_b32 s4, v40, 2
 ; HSA-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -7865,8 +7865,8 @@ define void @stack_8xv5i32() #0 {
 ; VI-NEXT:    v_mov_b32_e32 v30, 6
 ; VI-NEXT:    v_writelane_b32 v40, s31, 1
 ; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; VI-NEXT:    v_readlane_b32 s31, v40, 1
 ; VI-NEXT:    v_readlane_b32 s30, v40, 0
+; VI-NEXT:    v_readlane_b32 s31, v40, 1
 ; VI-NEXT:    s_mov_b32 s32, s33
 ; VI-NEXT:    v_readlane_b32 s4, v40, 2
 ; VI-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -7941,8 +7941,8 @@ define void @stack_8xv5i32() #0 {
 ; CI-NEXT:    v_mov_b32_e32 v30, 6
 ; CI-NEXT:    v_writelane_b32 v40, s31, 1
 ; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; CI-NEXT:    v_readlane_b32 s31, v40, 1
 ; CI-NEXT:    v_readlane_b32 s30, v40, 0
+; CI-NEXT:    v_readlane_b32 s31, v40, 1
 ; CI-NEXT:    s_mov_b32 s32, s33
 ; CI-NEXT:    v_readlane_b32 s4, v40, 2
 ; CI-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -8017,8 +8017,8 @@ define void @stack_8xv5i32() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v30, 6
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -8072,8 +8072,8 @@ define void @stack_8xv5i32() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -8148,8 +8148,8 @@ define void @stack_8xv5i32() #0 {
 ; HSA-NEXT:    v_mov_b32_e32 v30, 6
 ; HSA-NEXT:    v_writelane_b32 v40, s31, 1
 ; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; HSA-NEXT:    v_readlane_b32 s31, v40, 1
 ; HSA-NEXT:    v_readlane_b32 s30, v40, 0
+; HSA-NEXT:    v_readlane_b32 s31, v40, 1
 ; HSA-NEXT:    s_mov_b32 s32, s33
 ; HSA-NEXT:    v_readlane_b32 s4, v40, 2
 ; HSA-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -8237,8 +8237,8 @@ define void @stack_8xv5f32() #0 {
 ; VI-NEXT:    v_mov_b32_e32 v30, 0x40c00000
 ; VI-NEXT:    v_writelane_b32 v40, s31, 1
 ; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; VI-NEXT:    v_readlane_b32 s31, v40, 1
 ; VI-NEXT:    v_readlane_b32 s30, v40, 0
+; VI-NEXT:    v_readlane_b32 s31, v40, 1
 ; VI-NEXT:    s_mov_b32 s32, s33
 ; VI-NEXT:    v_readlane_b32 s4, v40, 2
 ; VI-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -8313,8 +8313,8 @@ define void @stack_8xv5f32() #0 {
 ; CI-NEXT:    v_mov_b32_e32 v30, 0x40c00000
 ; CI-NEXT:    v_writelane_b32 v40, s31, 1
 ; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; CI-NEXT:    v_readlane_b32 s31, v40, 1
 ; CI-NEXT:    v_readlane_b32 s30, v40, 0
+; CI-NEXT:    v_readlane_b32 s31, v40, 1
 ; CI-NEXT:    s_mov_b32 s32, s33
 ; CI-NEXT:    v_readlane_b32 s4, v40, 2
 ; CI-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -8389,8 +8389,8 @@ define void @stack_8xv5f32() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v30, 0x40c00000
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -8447,8 +8447,8 @@ define void @stack_8xv5f32() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -8523,8 +8523,8 @@ define void @stack_8xv5f32() #0 {
 ; HSA-NEXT:    v_mov_b32_e32 v30, 0x40c00000
 ; HSA-NEXT:    v_writelane_b32 v40, s31, 1
 ; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; HSA-NEXT:    v_readlane_b32 s31, v40, 1
 ; HSA-NEXT:    v_readlane_b32 s30, v40, 0
+; HSA-NEXT:    v_readlane_b32 s31, v40, 1
 ; HSA-NEXT:    s_mov_b32 s32, s33
 ; HSA-NEXT:    v_readlane_b32 s4, v40, 2
 ; HSA-NEXT:    s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
index b250227735bd3..26727e53d990c 100644
--- a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
@@ -25,8 +25,8 @@ define void @use_vcc() #1 {
 ; GCN: v_writelane_b32 v40, s30, 0
 ; GCN: v_writelane_b32 v40, s31, 1
 ; GCN: s_swappc_b64
-; GCN: v_readlane_b32 s31, v40, 1
 ; GCN: v_readlane_b32 s30, v40, 0
+; GCN: v_readlane_b32 s31, v40, 1
 ; GCN: v_readlane_b32 s4, v40, 2
 ; GCN: s_mov_b32 s33, s4
 ; GCN: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll
index aed1079158154..f9070339093da 100644
--- a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll
@@ -40,22 +40,22 @@ define void @test_func_call_external_void_func_void_clobber_s30_s31_call_externa
 ; MUBUF-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; MUBUF-NEXT:    s_mov_b64 exec, s[6:7]
 ; MUBUF-NEXT:    v_writelane_b32 v40, s4, 4
-; MUBUF-NEXT:    v_writelane_b32 v40, s30, 0
-; MUBUF-NEXT:    v_writelane_b32 v40, s31, 1
+; MUBUF-NEXT:    v_writelane_b32 v40, s34, 0
 ; MUBUF-NEXT:    s_addk_i32 s32, 0x400
-; MUBUF-NEXT:    v_writelane_b32 v40, s34, 2
-; MUBUF-NEXT:    v_writelane_b32 v40, s35, 3
+; MUBUF-NEXT:    v_writelane_b32 v40, s35, 1
+; MUBUF-NEXT:    v_writelane_b32 v40, s30, 2
 ; MUBUF-NEXT:    s_getpc_b64 s[34:35]
 ; MUBUF-NEXT:    s_add_u32 s34, s34, external_void_func_void@rel32@lo+4
 ; MUBUF-NEXT:    s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12
+; MUBUF-NEXT:    v_writelane_b32 v40, s31, 3
 ; MUBUF-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; MUBUF-NEXT:    ;;#ASMSTART
 ; MUBUF-NEXT:    ;;#ASMEND
 ; MUBUF-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; MUBUF-NEXT:    v_readlane_b32 s35, v40, 3
-; MUBUF-NEXT:    v_readlane_b32 s34, v40, 2
-; MUBUF-NEXT:    v_readlane_b32 s31, v40, 1
-; MUBUF-NEXT:    v_readlane_b32 s30, v40, 0
+; MUBUF-NEXT:    v_readlane_b32 s30, v40, 2
+; MUBUF-NEXT:    v_readlane_b32 s31, v40, 3
+; MUBUF-NEXT:    v_readlane_b32 s35, v40, 1
+; MUBUF-NEXT:    v_readlane_b32 s34, v40, 0
 ; MUBUF-NEXT:    s_mov_b32 s32, s33
 ; MUBUF-NEXT:    v_readlane_b32 s4, v40, 4
 ; MUBUF-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -74,22 +74,22 @@ define void @test_func_call_external_void_func_void_clobber_s30_s31_call_externa
 ; FLATSCR-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; FLATSCR-NEXT:    s_mov_b64 exec, s[2:3]
 ; FLATSCR-NEXT:    v_writelane_b32 v40, s0, 4
-; FLATSCR-NEXT:    v_writelane_b32 v40, s30, 0
-; FLATSCR-NEXT:    v_writelane_b32 v40, s31, 1
+; FLATSCR-NEXT:    v_writelane_b32 v40, s34, 0
 ; FLATSCR-NEXT:    s_add_i32 s32, s32, 16
-; FLATSCR-NEXT:    v_writelane_b32 v40, s34, 2
-; FLATSCR-NEXT:    v_writelane_b32 v40, s35, 3
+; FLATSCR-NEXT:    v_writelane_b32 v40, s35, 1
+; FLATSCR-NEXT:    v_writelane_b32 v40, s30, 2
 ; FLATSCR-NEXT:    s_getpc_b64 s[34:35]
 ; FLATSCR-NEXT:    s_add_u32 s34, s34, external_void_func_void@rel32@lo+4
 ; FLATSCR-NEXT:    s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12
+; FLATSCR-NEXT:    v_writelane_b32 v40, s31, 3
 ; FLATSCR-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; FLATSCR-NEXT:    ;;#ASMSTART
 ; FLATSCR-NEXT:    ;;#ASMEND
 ; FLATSCR-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; FLATSCR-NEXT:    v_readlane_b32 s35, v40, 3
-; FLATSCR-NEXT:    v_readlane_b32 s34, v40, 2
-; FLATSCR-NEXT:    v_readlane_b32 s31, v40, 1
-; FLATSCR-NEXT:    v_readlane_b32 s30, v40, 0
+; FLATSCR-NEXT:    v_readlane_b32 s30, v40, 2
+; FLATSCR-NEXT:    v_readlane_b32 s31, v40, 3
+; FLATSCR-NEXT:    v_readlane_b32 s35, v40, 1
+; FLATSCR-NEXT:    v_readlane_b32 s34, v40, 0
 ; FLATSCR-NEXT:    s_mov_b32 s32, s33
 ; FLATSCR-NEXT:    v_readlane_b32 s0, v40, 4
 ; FLATSCR-NEXT:    s_or_saveexec_b64 s[2:3], -1
@@ -114,20 +114,20 @@ define void @test_func_call_external_void_funcx2() #0 {
 ; MUBUF-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; MUBUF-NEXT:    s_mov_b64 exec, s[6:7]
 ; MUBUF-NEXT:    v_writelane_b32 v40, s4, 4
-; MUBUF-NEXT:    v_writelane_b32 v40, s30, 0
-; MUBUF-NEXT:    v_writelane_b32 v40, s31, 1
+; MUBUF-NEXT:    v_writelane_b32 v40, s34, 0
 ; MUBUF-NEXT:    s_addk_i32 s32, 0x400
-; MUBUF-NEXT:    v_writelane_b32 v40, s34, 2
-; MUBUF-NEXT:    v_writelane_b32 v40, s35, 3
+; MUBUF-NEXT:    v_writelane_b32 v40, s35, 1
+; MUBUF-NEXT:    v_writelane_b32 v40, s30, 2
 ; MUBUF-NEXT:    s_getpc_b64 s[34:35]
 ; MUBUF-NEXT:    s_add_u32 s34, s34, external_void_func_void@rel32@lo+4
 ; MUBUF-NEXT:    s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12
+; MUBUF-NEXT:    v_writelane_b32 v40, s31, 3
 ; MUBUF-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; MUBUF-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; MUBUF-NEXT:    v_readlane_b32 s35, v40, 3
-; MUBUF-NEXT:    v_readlane_b32 s34, v40, 2
-; MUBUF-NEXT:    v_readlane_b32 s31, v40, 1
-; MUBUF-NEXT:    v_readlane_b32 s30, v40, 0
+; MUBUF-NEXT:    v_readlane_b32 s30, v40, 2
+; MUBUF-NEXT:    v_readlane_b32 s31, v40, 3
+; MUBUF-NEXT:    v_readlane_b32 s35, v40, 1
+; MUBUF-NEXT:    v_readlane_b32 s34, v40, 0
 ; MUBUF-NEXT:    s_mov_b32 s32, s33
 ; MUBUF-NEXT:    v_readlane_b32 s4, v40, 4
 ; MUBUF-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -146,20 +146,20 @@ define void @test_func_call_external_void_funcx2() #0 {
 ; FLATSCR-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; FLATSCR-NEXT:    s_mov_b64 exec, s[2:3]
 ; FLATSCR-NEXT:    v_writelane_b32 v40, s0, 4
-; FLATSCR-NEXT:    v_writelane_b32 v40, s30, 0
-; FLATSCR-NEXT:    v_writelane_b32 v40, s31, 1
+; FLATSCR-NEXT:    v_writelane_b32 v40, s34, 0
 ; FLATSCR-NEXT:    s_add_i32 s32, s32, 16
-; FLATSCR-NEXT:    v_writelane_b32 v40, s34, 2
-; FLATSCR-NEXT:    v_writelane_b32 v40, s35, 3
+; FLATSCR-NEXT:    v_writelane_b32 v40, s35, 1
+; FLATSCR-NEXT:    v_writelane_b32 v40, s30, 2
 ; FLATSCR-NEXT:    s_getpc_b64 s[34:35]
 ; FLATSCR-NEXT:    s_add_u32 s34, s34, external_void_func_void@rel32@lo+4
 ; FLATSCR-NEXT:    s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12
+; FLATSCR-NEXT:    v_writelane_b32 v40, s31, 3
 ; FLATSCR-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; FLATSCR-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; FLATSCR-NEXT:    v_readlane_b32 s35, v40, 3
-; FLATSCR-NEXT:    v_readlane_b32 s34, v40, 2
-; FLATSCR-NEXT:    v_readlane_b32 s31, v40, 1
-; FLATSCR-NEXT:    v_readlane_b32 s30, v40, 0
+; FLATSCR-NEXT:    v_readlane_b32 s30, v40, 2
+; FLATSCR-NEXT:    v_readlane_b32 s31, v40, 3
+; FLATSCR-NEXT:    v_readlane_b32 s35, v40, 1
+; FLATSCR-NEXT:    v_readlane_b32 s34, v40, 0
 ; FLATSCR-NEXT:    s_mov_b32 s32, s33
 ; FLATSCR-NEXT:    v_readlane_b32 s0, v40, 4
 ; FLATSCR-NEXT:    s_or_saveexec_b64 s[2:3], -1
@@ -185,8 +185,8 @@ define void @void_func_void_clobber_s30_s31() #2 {
 ; MUBUF-NEXT:    ;;#ASMSTART
 ; MUBUF-NEXT:    ; clobber
 ; MUBUF-NEXT:    ;;#ASMEND
-; MUBUF-NEXT:    v_readlane_b32 s31, v0, 1
 ; MUBUF-NEXT:    v_readlane_b32 s30, v0, 0
+; MUBUF-NEXT:    v_readlane_b32 s31, v0, 1
 ; MUBUF-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; MUBUF-NEXT:    s_mov_b64 exec, s[4:5]
@@ -204,8 +204,8 @@ define void @void_func_void_clobber_s30_s31() #2 {
 ; FLATSCR-NEXT:    ;;#ASMSTART
 ; FLATSCR-NEXT:    ; clobber
 ; FLATSCR-NEXT:    ;;#ASMEND
-; FLATSCR-NEXT:    v_readlane_b32 s31, v0, 1
 ; FLATSCR-NEXT:    v_readlane_b32 s30, v0, 0
+; FLATSCR-NEXT:    v_readlane_b32 s31, v0, 1
 ; FLATSCR-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; FLATSCR-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; FLATSCR-NEXT:    s_mov_b64 exec, s[0:1]
@@ -452,23 +452,23 @@ define void @callee_saved_sgpr_func() #2 {
 ; MUBUF-NEXT:    s_mov_b64 exec, s[6:7]
 ; MUBUF-NEXT:    v_writelane_b32 v40, s4, 3
 ; MUBUF-NEXT:    s_addk_i32 s32, 0x400
-; MUBUF-NEXT:    v_writelane_b32 v40, s30, 0
-; MUBUF-NEXT:    v_writelane_b32 v40, s31, 1
+; MUBUF-NEXT:    v_writelane_b32 v40, s34, 0
+; MUBUF-NEXT:    v_writelane_b32 v40, s30, 1
 ; MUBUF-NEXT:    s_getpc_b64 s[4:5]
 ; MUBUF-NEXT:    s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
 ; MUBUF-NEXT:    s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
-; MUBUF-NEXT:    v_writelane_b32 v40, s34, 2
+; MUBUF-NEXT:    v_writelane_b32 v40, s31, 2
 ; MUBUF-NEXT:    ;;#ASMSTART
 ; MUBUF-NEXT:    ; def s40
 ; MUBUF-NEXT:    ;;#ASMEND
 ; MUBUF-NEXT:    s_mov_b32 s34, s40
 ; MUBUF-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; MUBUF-NEXT:    v_readlane_b32 s30, v40, 1
 ; MUBUF-NEXT:    ;;#ASMSTART
 ; MUBUF-NEXT:    ; use s34
 ; MUBUF-NEXT:    ;;#ASMEND
-; MUBUF-NEXT:    v_readlane_b32 s34, v40, 2
-; MUBUF-NEXT:    v_readlane_b32 s31, v40, 1
-; MUBUF-NEXT:    v_readlane_b32 s30, v40, 0
+; MUBUF-NEXT:    v_readlane_b32 s31, v40, 2
+; MUBUF-NEXT:    v_readlane_b32 s34, v40, 0
 ; MUBUF-NEXT:    s_mov_b32 s32, s33
 ; MUBUF-NEXT:    v_readlane_b32 s4, v40, 3
 ; MUBUF-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -488,23 +488,23 @@ define void @callee_saved_sgpr_func() #2 {
 ; FLATSCR-NEXT:    s_mov_b64 exec, s[2:3]
 ; FLATSCR-NEXT:    v_writelane_b32 v40, s0, 3
 ; FLATSCR-NEXT:    s_add_i32 s32, s32, 16
-; FLATSCR-NEXT:    v_writelane_b32 v40, s30, 0
-; FLATSCR-NEXT:    v_writelane_b32 v40, s31, 1
+; FLATSCR-NEXT:    v_writelane_b32 v40, s34, 0
+; FLATSCR-NEXT:    v_writelane_b32 v40, s30, 1
 ; FLATSCR-NEXT:    s_getpc_b64 s[0:1]
 ; FLATSCR-NEXT:    s_add_u32 s0, s0, external_void_func_void@rel32@lo+4
 ; FLATSCR-NEXT:    s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12
-; FLATSCR-NEXT:    v_writelane_b32 v40, s34, 2
+; FLATSCR-NEXT:    v_writelane_b32 v40, s31, 2
 ; FLATSCR-NEXT:    ;;#ASMSTART
 ; FLATSCR-NEXT:    ; def s40
 ; FLATSCR-NEXT:    ;;#ASMEND
 ; FLATSCR-NEXT:    s_mov_b32 s34, s40
 ; FLATSCR-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; FLATSCR-NEXT:    v_readlane_b32 s30, v40, 1
 ; FLATSCR-NEXT:    ;;#ASMSTART
 ; FLATSCR-NEXT:    ; use s34
 ; FLATSCR-NEXT:    ;;#ASMEND
-; FLATSCR-NEXT:    v_readlane_b32 s34, v40, 2
-; FLATSCR-NEXT:    v_readlane_b32 s31, v40, 1
-; FLATSCR-NEXT:    v_readlane_b32 s30, v40, 0
+; FLATSCR-NEXT:    v_readlane_b32 s31, v40, 2
+; FLATSCR-NEXT:    v_readlane_b32 s34, v40, 0
 ; FLATSCR-NEXT:    s_mov_b32 s32, s33
 ; FLATSCR-NEXT:    v_readlane_b32 s0, v40, 3
 ; FLATSCR-NEXT:    s_or_saveexec_b64 s[2:3], -1
@@ -555,13 +555,13 @@ define void @callee_saved_sgpr_vgpr_func() #2 {
 ; MUBUF-NEXT:    s_mov_b64 exec, s[6:7]
 ; MUBUF-NEXT:    v_writelane_b32 v41, s4, 3
 ; MUBUF-NEXT:    s_addk_i32 s32, 0x400
-; MUBUF-NEXT:    v_writelane_b32 v41, s30, 0
-; MUBUF-NEXT:    v_writelane_b32 v41, s31, 1
+; MUBUF-NEXT:    v_writelane_b32 v41, s34, 0
+; MUBUF-NEXT:    v_writelane_b32 v41, s30, 1
 ; MUBUF-NEXT:    s_getpc_b64 s[4:5]
 ; MUBUF-NEXT:    s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
 ; MUBUF-NEXT:    s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
 ; MUBUF-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
-; MUBUF-NEXT:    v_writelane_b32 v41, s34, 2
+; MUBUF-NEXT:    v_writelane_b32 v41, s31, 2
 ; MUBUF-NEXT:    ;;#ASMSTART
 ; MUBUF-NEXT:    ; def s40
 ; MUBUF-NEXT:    ;;#ASMEND
@@ -577,9 +577,9 @@ define void @callee_saved_sgpr_vgpr_func() #2 {
 ; MUBUF-NEXT:    ; use v40
 ; MUBUF-NEXT:    ;;#ASMEND
 ; MUBUF-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
-; MUBUF-NEXT:    v_readlane_b32 s34, v41, 2
-; MUBUF-NEXT:    v_readlane_b32 s31, v41, 1
-; MUBUF-NEXT:    v_readlane_b32 s30, v41, 0
+; MUBUF-NEXT:    v_readlane_b32 s30, v41, 1
+; MUBUF-NEXT:    v_readlane_b32 s31, v41, 2
+; MUBUF-NEXT:    v_readlane_b32 s34, v41, 0
 ; MUBUF-NEXT:    s_mov_b32 s32, s33
 ; MUBUF-NEXT:    v_readlane_b32 s4, v41, 3
 ; MUBUF-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -599,13 +599,13 @@ define void @callee_saved_sgpr_vgpr_func() #2 {
 ; FLATSCR-NEXT:    s_mov_b64 exec, s[2:3]
 ; FLATSCR-NEXT:    v_writelane_b32 v41, s0, 3
 ; FLATSCR-NEXT:    s_add_i32 s32, s32, 16
-; FLATSCR-NEXT:    v_writelane_b32 v41, s30, 0
-; FLATSCR-NEXT:    v_writelane_b32 v41, s31, 1
+; FLATSCR-NEXT:    v_writelane_b32 v41, s34, 0
+; FLATSCR-NEXT:    v_writelane_b32 v41, s30, 1
 ; FLATSCR-NEXT:    s_getpc_b64 s[0:1]
 ; FLATSCR-NEXT:    s_add_u32 s0, s0, external_void_func_void@rel32@lo+4
 ; FLATSCR-NEXT:    s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12
 ; FLATSCR-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
-; FLATSCR-NEXT:    v_writelane_b32 v41, s34, 2
+; FLATSCR-NEXT:    v_writelane_b32 v41, s31, 2
 ; FLATSCR-NEXT:    ;;#ASMSTART
 ; FLATSCR-NEXT:    ; def s40
 ; FLATSCR-NEXT:    ;;#ASMEND
@@ -621,9 +621,9 @@ define void @callee_saved_sgpr_vgpr_func() #2 {
 ; FLATSCR-NEXT:    ; use v40
 ; FLATSCR-NEXT:    ;;#ASMEND
 ; FLATSCR-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
-; FLATSCR-NEXT:    v_readlane_b32 s34, v41, 2
-; FLATSCR-NEXT:    v_readlane_b32 s31, v41, 1
-; FLATSCR-NEXT:    v_readlane_b32 s30, v41, 0
+; FLATSCR-NEXT:    v_readlane_b32 s30, v41, 1
+; FLATSCR-NEXT:    v_readlane_b32 s31, v41, 2
+; FLATSCR-NEXT:    v_readlane_b32 s34, v41, 0
 ; FLATSCR-NEXT:    s_mov_b32 s32, s33
 ; FLATSCR-NEXT:    v_readlane_b32 s0, v41, 3
 ; FLATSCR-NEXT:    s_or_saveexec_b64 s[2:3], -1
diff --git a/llvm/test/CodeGen/AMDGPU/call-skip.ll b/llvm/test/CodeGen/AMDGPU/call-skip.ll
index ada817b402444..a2b487580aefb 100644
--- a/llvm/test/CodeGen/AMDGPU/call-skip.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-skip.ll
@@ -34,8 +34,8 @@ define void @if_call(i32 %flag) #0 {
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; GCN-NEXT:  .LBB1_2: ; %end
 ; GCN-NEXT:    s_or_b64 exec, exec, s[16:17]
-; GCN-NEXT:    v_readlane_b32 s31, v1, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v1, 0
+; GCN-NEXT:    v_readlane_b32 s31, v1, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
index 7a5132c993604..c65019fb1a401 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
@@ -132,8 +132,8 @@ define void @callee_with_stack_and_call() #0 {
 ; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s33
 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 ; MUBUF-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; MUBUF-NEXT:    v_readlane_b32 s31, v40, 1
 ; MUBUF-NEXT:    v_readlane_b32 s30, v40, 0
+; MUBUF-NEXT:    v_readlane_b32 s31, v40, 1
 ; MUBUF-NEXT:    s_mov_b32 s32, s33
 ; MUBUF-NEXT:    v_readlane_b32 s4, v40, 2
 ; MUBUF-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -162,8 +162,8 @@ define void @callee_with_stack_and_call() #0 {
 ; FLATSCR-NEXT:    scratch_store_dword off, v0, s33
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; FLATSCR-NEXT:    v_readlane_b32 s31, v40, 1
 ; FLATSCR-NEXT:    v_readlane_b32 s30, v40, 0
+; FLATSCR-NEXT:    v_readlane_b32 s31, v40, 1
 ; FLATSCR-NEXT:    s_mov_b32 s32, s33
 ; FLATSCR-NEXT:    v_readlane_b32 s0, v40, 2
 ; FLATSCR-NEXT:    s_or_saveexec_b64 s[2:3], -1
@@ -201,8 +201,8 @@ define void @callee_no_stack_with_call() #0 {
 ; MUBUF-NEXT:    s_addc_u32 s17, s17, external_void_func_void@rel32@hi+12
 ; MUBUF-NEXT:    v_writelane_b32 v40, s31, 1
 ; MUBUF-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; MUBUF-NEXT:    v_readlane_b32 s31, v40, 1
 ; MUBUF-NEXT:    v_readlane_b32 s30, v40, 0
+; MUBUF-NEXT:    v_readlane_b32 s31, v40, 1
 ; MUBUF-NEXT:    s_mov_b32 s32, s33
 ; MUBUF-NEXT:    v_readlane_b32 s4, v40, 2
 ; MUBUF-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -228,8 +228,8 @@ define void @callee_no_stack_with_call() #0 {
 ; FLATSCR-NEXT:    s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12
 ; FLATSCR-NEXT:    v_writelane_b32 v40, s31, 1
 ; FLATSCR-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; FLATSCR-NEXT:    v_readlane_b32 s31, v40, 1
 ; FLATSCR-NEXT:    v_readlane_b32 s30, v40, 0
+; FLATSCR-NEXT:    v_readlane_b32 s31, v40, 1
 ; FLATSCR-NEXT:    s_mov_b32 s32, s33
 ; FLATSCR-NEXT:    v_readlane_b32 s0, v40, 2
 ; FLATSCR-NEXT:    s_or_saveexec_b64 s[2:3], -1
@@ -359,24 +359,24 @@ define void @callee_func_sgpr_spill_no_calls(i32 %in) #0 {
 ; FLATSCR-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; FLATSCR-NEXT:    scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
 ; FLATSCR-NEXT:    s_mov_b64 exec, s[0:1]
-; FLATSCR-NEXT:    v_writelane_b32 v40, s30, 0
-; FLATSCR-NEXT:    v_writelane_b32 v40, s31, 1
-; FLATSCR-NEXT:    v_writelane_b32 v40, s36, 2
-; FLATSCR-NEXT:    v_writelane_b32 v40, s37, 3
-; FLATSCR-NEXT:    v_writelane_b32 v40, s38, 4
-; FLATSCR-NEXT:    v_writelane_b32 v40, s39, 5
-; FLATSCR-NEXT:    v_writelane_b32 v40, s48, 6
-; FLATSCR-NEXT:    v_writelane_b32 v40, s49, 7
-; FLATSCR-NEXT:    v_writelane_b32 v40, s50, 8
-; FLATSCR-NEXT:    v_writelane_b32 v40, s51, 9
-; FLATSCR-NEXT:    v_writelane_b32 v40, s52, 10
-; FLATSCR-NEXT:    v_writelane_b32 v40, s53, 11
-; FLATSCR-NEXT:    v_writelane_b32 v40, s54, 12
-; FLATSCR-NEXT:    v_writelane_b32 v40, s55, 13
-; FLATSCR-NEXT:    v_writelane_b32 v40, s64, 14
-; FLATSCR-NEXT:    v_writelane_b32 v40, s65, 15
-; FLATSCR-NEXT:    v_writelane_b32 v40, s66, 16
-; FLATSCR-NEXT:    v_writelane_b32 v40, s67, 17
+; FLATSCR-NEXT:    v_writelane_b32 v40, s36, 0
+; FLATSCR-NEXT:    v_writelane_b32 v40, s37, 1
+; FLATSCR-NEXT:    v_writelane_b32 v40, s38, 2
+; FLATSCR-NEXT:    v_writelane_b32 v40, s39, 3
+; FLATSCR-NEXT:    v_writelane_b32 v40, s48, 4
+; FLATSCR-NEXT:    v_writelane_b32 v40, s49, 5
+; FLATSCR-NEXT:    v_writelane_b32 v40, s50, 6
+; FLATSCR-NEXT:    v_writelane_b32 v40, s51, 7
+; FLATSCR-NEXT:    v_writelane_b32 v40, s52, 8
+; FLATSCR-NEXT:    v_writelane_b32 v40, s53, 9
+; FLATSCR-NEXT:    v_writelane_b32 v40, s54, 10
+; FLATSCR-NEXT:    v_writelane_b32 v40, s55, 11
+; FLATSCR-NEXT:    v_writelane_b32 v40, s64, 12
+; FLATSCR-NEXT:    v_writelane_b32 v40, s65, 13
+; FLATSCR-NEXT:    v_writelane_b32 v40, s66, 14
+; FLATSCR-NEXT:    v_writelane_b32 v40, s67, 15
+; FLATSCR-NEXT:    v_writelane_b32 v40, s30, 16
+; FLATSCR-NEXT:    v_writelane_b32 v40, s31, 17
 ; FLATSCR-NEXT:    ;;#ASMSTART
 ; FLATSCR-NEXT:    ;;#ASMEND
 ; FLATSCR-NEXT:    ;;#ASMSTART
@@ -414,6 +414,7 @@ define void @callee_func_sgpr_spill_no_calls(i32 %in) #0 {
 ; FLATSCR-NEXT:    ;;#ASMSTART
 ; FLATSCR-NEXT:    ; use s[16:31]
 ; FLATSCR-NEXT:    ;;#ASMEND
+; FLATSCR-NEXT:    v_readlane_b32 s30, v40, 16
 ; FLATSCR-NEXT:    ;;#ASMSTART
 ; FLATSCR-NEXT:    ; use s[72:79]
 ; FLATSCR-NEXT:    ;;#ASMEND
@@ -423,24 +424,23 @@ define void @callee_func_sgpr_spill_no_calls(i32 %in) #0 {
 ; FLATSCR-NEXT:    ;;#ASMSTART
 ; FLATSCR-NEXT:    ; use s[0:15]
 ; FLATSCR-NEXT:    ;;#ASMEND
-; FLATSCR-NEXT:    v_readlane_b32 s67, v40, 17
-; FLATSCR-NEXT:    v_readlane_b32 s66, v40, 16
-; FLATSCR-NEXT:    v_readlane_b32 s65, v40, 15
-; FLATSCR-NEXT:    v_readlane_b32 s64, v40, 14
-; FLATSCR-NEXT:    v_readlane_b32 s55, v40, 13
-; FLATSCR-NEXT:    v_readlane_b32 s54, v40, 12
-; FLATSCR-NEXT:    v_readlane_b32 s53, v40, 11
-; FLATSCR-NEXT:    v_readlane_b32 s52, v40, 10
-; FLATSCR-NEXT:    v_readlane_b32 s51, v40, 9
-; FLATSCR-NEXT:    v_readlane_b32 s50, v40, 8
-; FLATSCR-NEXT:    v_readlane_b32 s49, v40, 7
-; FLATSCR-NEXT:    v_readlane_b32 s48, v40, 6
-; FLATSCR-NEXT:    v_readlane_b32 s39, v40, 5
-; FLATSCR-NEXT:    v_readlane_b32 s38, v40, 4
-; FLATSCR-NEXT:    v_readlane_b32 s37, v40, 3
-; FLATSCR-NEXT:    v_readlane_b32 s36, v40, 2
-; FLATSCR-NEXT:    v_readlane_b32 s31, v40, 1
-; FLATSCR-NEXT:    v_readlane_b32 s30, v40, 0
+; FLATSCR-NEXT:    v_readlane_b32 s31, v40, 17
+; FLATSCR-NEXT:    v_readlane_b32 s67, v40, 15
+; FLATSCR-NEXT:    v_readlane_b32 s66, v40, 14
+; FLATSCR-NEXT:    v_readlane_b32 s65, v40, 13
+; FLATSCR-NEXT:    v_readlane_b32 s64, v40, 12
+; FLATSCR-NEXT:    v_readlane_b32 s55, v40, 11
+; FLATSCR-NEXT:    v_readlane_b32 s54, v40, 10
+; FLATSCR-NEXT:    v_readlane_b32 s53, v40, 9
+; FLATSCR-NEXT:    v_readlane_b32 s52, v40, 8
+; FLATSCR-NEXT:    v_readlane_b32 s51, v40, 7
+; FLATSCR-NEXT:    v_readlane_b32 s50, v40, 6
+; FLATSCR-NEXT:    v_readlane_b32 s49, v40, 5
+; FLATSCR-NEXT:    v_readlane_b32 s48, v40, 4
+; FLATSCR-NEXT:    v_readlane_b32 s39, v40, 3
+; FLATSCR-NEXT:    v_readlane_b32 s38, v40, 2
+; FLATSCR-NEXT:    v_readlane_b32 s37, v40, 1
+; FLATSCR-NEXT:    v_readlane_b32 s36, v40, 0
 ; FLATSCR-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; FLATSCR-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; FLATSCR-NEXT:    s_mov_b64 exec, s[0:1]
@@ -971,14 +971,14 @@ define void @no_unused_non_csr_sgpr_for_fp() #1 {
 ; MUBUF-NEXT:    s_mov_b64 exec, s[4:5]
 ; MUBUF-NEXT:    v_writelane_b32 v1, s30, 0
 ; MUBUF-NEXT:    v_mov_b32_e32 v0, 0
-; MUBUF-NEXT:    s_addk_i32 s32, 0x300
 ; MUBUF-NEXT:    v_writelane_b32 v1, s31, 1
 ; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s33
 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 ; MUBUF-NEXT:    ;;#ASMSTART
 ; MUBUF-NEXT:    ;;#ASMEND
-; MUBUF-NEXT:    v_readlane_b32 s31, v1, 1
+; MUBUF-NEXT:    s_addk_i32 s32, 0x300
 ; MUBUF-NEXT:    v_readlane_b32 s30, v1, 0
+; MUBUF-NEXT:    v_readlane_b32 s31, v1, 1
 ; MUBUF-NEXT:    s_mov_b32 s32, s33
 ; MUBUF-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; MUBUF-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
@@ -997,14 +997,14 @@ define void @no_unused_non_csr_sgpr_for_fp() #1 {
 ; FLATSCR-NEXT:    s_mov_b64 exec, s[0:1]
 ; FLATSCR-NEXT:    v_writelane_b32 v1, s30, 0
 ; FLATSCR-NEXT:    v_mov_b32_e32 v0, 0
-; FLATSCR-NEXT:    s_add_i32 s32, s32, 12
 ; FLATSCR-NEXT:    v_writelane_b32 v1, s31, 1
 ; FLATSCR-NEXT:    scratch_store_dword off, v0, s33
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT:    ;;#ASMSTART
 ; FLATSCR-NEXT:    ;;#ASMEND
-; FLATSCR-NEXT:    v_readlane_b32 s31, v1, 1
+; FLATSCR-NEXT:    s_add_i32 s32, s32, 12
 ; FLATSCR-NEXT:    v_readlane_b32 s30, v1, 0
+; FLATSCR-NEXT:    v_readlane_b32 s31, v1, 1
 ; FLATSCR-NEXT:    s_mov_b32 s32, s33
 ; FLATSCR-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; FLATSCR-NEXT:    scratch_load_dword v1, off, s33 offset:4 ; 4-byte Folded Reload
@@ -1037,17 +1037,17 @@ define void @no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr() #1 {
 ; MUBUF-NEXT:    s_mov_b64 exec, s[4:5]
 ; MUBUF-NEXT:    v_writelane_b32 v40, s30, 0
 ; MUBUF-NEXT:    v_mov_b32_e32 v0, 0
-; MUBUF-NEXT:    s_addk_i32 s32, 0x300
 ; MUBUF-NEXT:    v_writelane_b32 v40, s31, 1
 ; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s33
 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 ; MUBUF-NEXT:    ;;#ASMSTART
 ; MUBUF-NEXT:    ;;#ASMEND
+; MUBUF-NEXT:    s_addk_i32 s32, 0x300
+; MUBUF-NEXT:    v_readlane_b32 s30, v40, 0
 ; MUBUF-NEXT:    ;;#ASMSTART
 ; MUBUF-NEXT:    ; clobber nonpreserved initial VGPRs
 ; MUBUF-NEXT:    ;;#ASMEND
 ; MUBUF-NEXT:    v_readlane_b32 s31, v40, 1
-; MUBUF-NEXT:    v_readlane_b32 s30, v40, 0
 ; MUBUF-NEXT:    s_mov_b32 s32, s33
 ; MUBUF-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; MUBUF-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
@@ -1066,17 +1066,17 @@ define void @no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr() #1 {
 ; FLATSCR-NEXT:    s_mov_b64 exec, s[0:1]
 ; FLATSCR-NEXT:    v_writelane_b32 v40, s30, 0
 ; FLATSCR-NEXT:    v_mov_b32_e32 v0, 0
-; FLATSCR-NEXT:    s_add_i32 s32, s32, 12
 ; FLATSCR-NEXT:    v_writelane_b32 v40, s31, 1
 ; FLATSCR-NEXT:    scratch_store_dword off, v0, s33
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT:    ;;#ASMSTART
 ; FLATSCR-NEXT:    ;;#ASMEND
+; FLATSCR-NEXT:    s_add_i32 s32, s32, 12
+; FLATSCR-NEXT:    v_readlane_b32 s30, v40, 0
 ; FLATSCR-NEXT:    ;;#ASMSTART
 ; FLATSCR-NEXT:    ; clobber nonpreserved initial VGPRs
 ; FLATSCR-NEXT:    ;;#ASMEND
 ; FLATSCR-NEXT:    v_readlane_b32 s31, v40, 1
-; FLATSCR-NEXT:    v_readlane_b32 s30, v40, 0
 ; FLATSCR-NEXT:    s_mov_b32 s32, s33
 ; FLATSCR-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; FLATSCR-NEXT:    scratch_load_dword v40, off, s33 offset:4 ; 4-byte Folded Reload
@@ -1118,18 +1118,18 @@ define void @scratch_reg_needed_mubuf_offset(ptr addrspace(5) byval([4096 x i8])
 ; MUBUF-NEXT:    v_writelane_b32 v40, s30, 0
 ; MUBUF-NEXT:    v_mov_b32_e32 v0, 0
 ; MUBUF-NEXT:    v_mov_b32_e32 v1, 0x1000
-; MUBUF-NEXT:    s_add_i32 s32, s32, 0x40300
 ; MUBUF-NEXT:    v_writelane_b32 v40, s31, 1
 ; MUBUF-NEXT:    buffer_store_dword v0, v1, s[0:3], s33 offen
 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 ; MUBUF-NEXT:    ;;#ASMSTART
 ; MUBUF-NEXT:    ; clobber nonpreserved SGPRs
 ; MUBUF-NEXT:    ;;#ASMEND
+; MUBUF-NEXT:    s_add_i32 s32, s32, 0x40300
+; MUBUF-NEXT:    v_readlane_b32 s30, v40, 0
 ; MUBUF-NEXT:    ;;#ASMSTART
 ; MUBUF-NEXT:    ; clobber nonpreserved VGPRs
 ; MUBUF-NEXT:    ;;#ASMEND
 ; MUBUF-NEXT:    v_readlane_b32 s31, v40, 1
-; MUBUF-NEXT:    v_readlane_b32 s30, v40, 0
 ; MUBUF-NEXT:    s_mov_b32 s32, s33
 ; MUBUF-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; MUBUF-NEXT:    s_add_i32 s6, s33, 0x40100
@@ -1158,11 +1158,11 @@ define void @scratch_reg_needed_mubuf_offset(ptr addrspace(5) byval([4096 x i8])
 ; FLATSCR-NEXT:    ;;#ASMSTART
 ; FLATSCR-NEXT:    ; clobber nonpreserved SGPRs
 ; FLATSCR-NEXT:    ;;#ASMEND
+; FLATSCR-NEXT:    v_readlane_b32 s30, v40, 0
 ; FLATSCR-NEXT:    ;;#ASMSTART
 ; FLATSCR-NEXT:    ; clobber nonpreserved VGPRs
 ; FLATSCR-NEXT:    ;;#ASMEND
 ; FLATSCR-NEXT:    v_readlane_b32 s31, v40, 1
-; FLATSCR-NEXT:    v_readlane_b32 s30, v40, 0
 ; FLATSCR-NEXT:    s_mov_b32 s32, s33
 ; FLATSCR-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; FLATSCR-NEXT:    s_add_i32 s2, s33, 0x1004
@@ -1220,8 +1220,8 @@ define void @ipra_call_with_stack() #0 {
 ; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s33
 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 ; MUBUF-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; MUBUF-NEXT:    v_readlane_b32 s31, v1, 1
 ; MUBUF-NEXT:    v_readlane_b32 s30, v1, 0
+; MUBUF-NEXT:    v_readlane_b32 s31, v1, 1
 ; MUBUF-NEXT:    s_mov_b32 s32, s33
 ; MUBUF-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; MUBUF-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
@@ -1248,8 +1248,8 @@ define void @ipra_call_with_stack() #0 {
 ; FLATSCR-NEXT:    scratch_store_dword off, v0, s33
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; FLATSCR-NEXT:    v_readlane_b32 s31, v1, 1
 ; FLATSCR-NEXT:    v_readlane_b32 s30, v1, 0
+; FLATSCR-NEXT:    v_readlane_b32 s31, v1, 1
 ; FLATSCR-NEXT:    s_mov_b32 s32, s33
 ; FLATSCR-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; FLATSCR-NEXT:    scratch_load_dword v1, off, s33 offset:4 ; 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll
index 5f965ba431ab5..bb5963244da3c 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll
@@ -430,8 +430,8 @@ define void @func_indirect_use_workitem_id_x() #1 {
 ; GCN-NEXT:    v_writelane_b32 v40, s31, 1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 2
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -463,8 +463,8 @@ define void @func_indirect_use_workitem_id_y() #1 {
 ; GCN-NEXT:    v_writelane_b32 v40, s31, 1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 2
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -496,8 +496,8 @@ define void @func_indirect_use_workitem_id_z() #1 {
 ; GCN-NEXT:    v_writelane_b32 v40, s31, 1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 2
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -984,8 +984,8 @@ define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 {
 ; GFX7-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX7-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX7-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX7-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX7-NEXT:    s_mov_b32 s32, s33
 ; GFX7-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX7-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -1048,8 +1048,8 @@ define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 {
 ; GFX90A-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX90A-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX90A-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX90A-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX90A-NEXT:    s_mov_b32 s32, s33
 ; GFX90A-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX90A-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -1094,8 +1094,8 @@ define void @too_many_args_call_too_many_args_use_workitem_id_x(
 ; GCN-NEXT:    buffer_store_dword v32, off, s[0:3], s32
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 2
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -1445,8 +1445,8 @@ define void @func_call_too_many_args_use_workitem_id_x_byval() #1 {
 ; GCN-NEXT:    v_mov_b32_e32 v0, 10
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 2
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
index bb2f06bfe83f8..f20be656f3af0 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
@@ -275,8 +275,8 @@ define void @func_indirect_use_workitem_id_x() #1 {
 ; GCN-NEXT:    v_writelane_b32 v40, s31, 1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 2
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -308,8 +308,8 @@ define void @func_indirect_use_workitem_id_y() #1 {
 ; GCN-NEXT:    v_writelane_b32 v40, s31, 1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 2
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -341,8 +341,8 @@ define void @func_indirect_use_workitem_id_z() #1 {
 ; GCN-NEXT:    v_writelane_b32 v40, s31, 1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 2
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -696,8 +696,8 @@ define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 {
 ; GCN-NEXT:    v_writelane_b32 v40, s31, 1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 2
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -742,8 +742,8 @@ define void @too_many_args_call_too_many_args_use_workitem_id_x(
 ; GCN-NEXT:    buffer_store_dword v32, off, s[0:3], s32
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 2
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -1019,8 +1019,8 @@ define void @func_call_too_many_args_use_workitem_id_x_byval() #1 {
 ; GCN-NEXT:    v_mov_b32_e32 v0, 10
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 2
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -1469,8 +1469,8 @@ define void @func_call_no_workitem_id_hints() #2 {
 ; GCN-NEXT:    v_mov_b32_e32 v0, 9
 ; GCN-NEXT:    v_writelane_b32 v40, s31, 1
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 2
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/cc-entry.ll b/llvm/test/CodeGen/AMDGPU/cc-entry.ll
index 7a241e0e4dff0..6d3a6dc43544b 100644
--- a/llvm/test/CodeGen/AMDGPU/cc-entry.ll
+++ b/llvm/test/CodeGen/AMDGPU/cc-entry.ll
@@ -51,8 +51,8 @@ define void @caller() {
 ; CHECK-NEXT:    s_wait_kmcnt 0x0
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
 ; CHECK-NEXT:    v_readlane_b32 s30, v40, 0
+; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
 ; CHECK-NEXT:    s_mov_b32 s32, s33
 ; CHECK-NEXT:    v_readlane_b32 s0, v40, 2
 ; CHECK-NEXT:    s_or_saveexec_b32 s1, -1
diff --git a/llvm/test/CodeGen/AMDGPU/cc-inreg-sgpr0-3-mismatch.ll b/llvm/test/CodeGen/AMDGPU/cc-inreg-sgpr0-3-mismatch.ll
index 86486e56d46ac..4144fafa15684 100644
--- a/llvm/test/CodeGen/AMDGPU/cc-inreg-sgpr0-3-mismatch.ll
+++ b/llvm/test/CodeGen/AMDGPU/cc-inreg-sgpr0-3-mismatch.ll
@@ -153,8 +153,8 @@ define i32 @caller_passes_42() {
 ; SDAG-NEXT:    v_writelane_b32 v18, s31, 1
 ; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; SDAG-NEXT:    s_swappc_b64 s[30:31], s[40:41]
-; SDAG-NEXT:    v_readlane_b32 s31, v18, 1
 ; SDAG-NEXT:    v_readlane_b32 s30, v18, 0
+; SDAG-NEXT:    v_readlane_b32 s31, v18, 1
 ; SDAG-NEXT:    s_mov_b32 s32, s33
 ; SDAG-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SDAG-NEXT:    buffer_load_dword v18, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -212,8 +212,8 @@ define i32 @caller_passes_42() {
 ; GISEL-NEXT:    v_writelane_b32 v18, s31, 1
 ; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GISEL-NEXT:    s_swappc_b64 s[30:31], s[40:41]
-; GISEL-NEXT:    v_readlane_b32 s31, v18, 1
 ; GISEL-NEXT:    v_readlane_b32 s30, v18, 0
+; GISEL-NEXT:    v_readlane_b32 s31, v18, 1
 ; GISEL-NEXT:    s_mov_b32 s32, s33
 ; GISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GISEL-NEXT:    buffer_load_dword v18, off, s[0:3], s33 ; 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
index ef676ddc8070e..5b37c93ac0bb6 100644
--- a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
+++ b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
@@ -378,29 +378,29 @@ define double @test_pow_fast_f64integral_y(double %x, i32 %y.i) #0 {
 ; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[18:19]
 ; GFX9-NEXT:    v_writelane_b32 v43, s16, 14
-; GFX9-NEXT:    v_writelane_b32 v43, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v43, s31, 1
-; GFX9-NEXT:    v_writelane_b32 v43, s34, 2
-; GFX9-NEXT:    v_writelane_b32 v43, s35, 3
-; GFX9-NEXT:    v_writelane_b32 v43, s36, 4
-; GFX9-NEXT:    v_writelane_b32 v43, s37, 5
-; GFX9-NEXT:    v_writelane_b32 v43, s38, 6
-; GFX9-NEXT:    v_writelane_b32 v43, s39, 7
-; GFX9-NEXT:    v_writelane_b32 v43, s48, 8
-; GFX9-NEXT:    v_writelane_b32 v43, s49, 9
-; GFX9-NEXT:    v_writelane_b32 v43, s50, 10
+; GFX9-NEXT:    v_writelane_b32 v43, s34, 0
+; GFX9-NEXT:    v_writelane_b32 v43, s35, 1
+; GFX9-NEXT:    v_writelane_b32 v43, s36, 2
+; GFX9-NEXT:    v_writelane_b32 v43, s37, 3
+; GFX9-NEXT:    v_writelane_b32 v43, s38, 4
+; GFX9-NEXT:    v_writelane_b32 v43, s39, 5
+; GFX9-NEXT:    v_writelane_b32 v43, s48, 6
+; GFX9-NEXT:    v_writelane_b32 v43, s49, 7
+; GFX9-NEXT:    v_writelane_b32 v43, s50, 8
+; GFX9-NEXT:    v_writelane_b32 v43, s51, 9
+; GFX9-NEXT:    v_writelane_b32 v43, s52, 10
 ; GFX9-NEXT:    s_addk_i32 s32, 0x800
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_writelane_b32 v43, s51, 11
+; GFX9-NEXT:    v_writelane_b32 v43, s53, 11
 ; GFX9-NEXT:    v_mov_b32_e32 v42, v1
-; GFX9-NEXT:    v_writelane_b32 v43, s52, 12
+; GFX9-NEXT:    v_writelane_b32 v43, s30, 12
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v42
 ; GFX9-NEXT:    s_getpc_b64 s[16:17]
 ; GFX9-NEXT:    s_add_u32 s16, s16, _Z4log2d@rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s17, s17, _Z4log2d@rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v43, s53, 13
+; GFX9-NEXT:    v_writelane_b32 v43, s31, 13
 ; GFX9-NEXT:    v_mov_b32_e32 v40, v31
 ; GFX9-NEXT:    v_mov_b32_e32 v41, v2
 ; GFX9-NEXT:    s_mov_b32 s50, s15
@@ -432,21 +432,21 @@ define double @test_pow_fast_f64integral_y(double %x, i32 %y.i) #0 {
 ; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GFX9-NEXT:    v_readlane_b32 s30, v43, 12
 ; GFX9-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX9-NEXT:    v_readlane_b32 s53, v43, 13
-; GFX9-NEXT:    v_readlane_b32 s52, v43, 12
-; GFX9-NEXT:    v_readlane_b32 s51, v43, 11
-; GFX9-NEXT:    v_readlane_b32 s50, v43, 10
-; GFX9-NEXT:    v_readlane_b32 s49, v43, 9
-; GFX9-NEXT:    v_readlane_b32 s48, v43, 8
-; GFX9-NEXT:    v_readlane_b32 s39, v43, 7
-; GFX9-NEXT:    v_readlane_b32 s38, v43, 6
-; GFX9-NEXT:    v_readlane_b32 s37, v43, 5
-; GFX9-NEXT:    v_readlane_b32 s36, v43, 4
-; GFX9-NEXT:    v_readlane_b32 s35, v43, 3
-; GFX9-NEXT:    v_readlane_b32 s34, v43, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v43, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v43, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v43, 13
+; GFX9-NEXT:    v_readlane_b32 s53, v43, 11
+; GFX9-NEXT:    v_readlane_b32 s52, v43, 10
+; GFX9-NEXT:    v_readlane_b32 s51, v43, 9
+; GFX9-NEXT:    v_readlane_b32 s50, v43, 8
+; GFX9-NEXT:    v_readlane_b32 s49, v43, 7
+; GFX9-NEXT:    v_readlane_b32 s48, v43, 6
+; GFX9-NEXT:    v_readlane_b32 s39, v43, 5
+; GFX9-NEXT:    v_readlane_b32 s38, v43, 4
+; GFX9-NEXT:    v_readlane_b32 s37, v43, 3
+; GFX9-NEXT:    v_readlane_b32 s36, v43, 2
+; GFX9-NEXT:    v_readlane_b32 s35, v43, 1
+; GFX9-NEXT:    v_readlane_b32 s34, v43, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v43, 14
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
index 45836ff81f774..d2c1bd151230e 100644
--- a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
@@ -40,8 +40,8 @@ define float @call_split_type_used_outside_block_v2f32() #0 {
 ; GCN-NEXT:    s_addc_u32 s17, s17, func_v2f32@rel32@hi+12
 ; GCN-NEXT:    v_writelane_b32 v40, s31, 1
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 2
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -76,8 +76,8 @@ define float @call_split_type_used_outside_block_v3f32() #0 {
 ; GCN-NEXT:    s_addc_u32 s17, s17, func_v3f32@rel32@hi+12
 ; GCN-NEXT:    v_writelane_b32 v40, s31, 1
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 2
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -112,8 +112,8 @@ define half @call_split_type_used_outside_block_v4f16() #0 {
 ; GCN-NEXT:    s_addc_u32 s17, s17, func_v4f16@rel32@hi+12
 ; GCN-NEXT:    v_writelane_b32 v40, s31, 1
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 2
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -148,8 +148,8 @@ define { i32, half } @call_split_type_used_outside_block_struct() #0 {
 ; GCN-NEXT:    s_addc_u32 s17, s17, func_struct@rel32@hi+12
 ; GCN-NEXT:    v_writelane_b32 v40, s31, 1
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_mov_b32_e32 v1, v4
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 2
diff --git a/llvm/test/CodeGen/AMDGPU/debug-frame.ll b/llvm/test/CodeGen/AMDGPU/debug-frame.ll
index 555b24a00f2cf..c6ac9837b8633 100644
--- a/llvm/test/CodeGen/AMDGPU/debug-frame.ll
+++ b/llvm/test/CodeGen/AMDGPU/debug-frame.ll
@@ -2005,8 +2005,8 @@ define hidden void @func_call_clobber() #0 {
 ; GFX900-NEXT:    s_addc_u32 s17, s17, ex@rel32@hi+12
 ; GFX900-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX900-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX900-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX900-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX900-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX900-NEXT:    s_mov_b32 s32, s33
 ; GFX900-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX900-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -2278,8 +2278,8 @@ define hidden void @func_call_clobber() #0 {
 ; GFX90A-V2A-DIS-NEXT:    s_addc_u32 s17, s17, ex@rel32@hi+12
 ; GFX90A-V2A-DIS-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX90A-V2A-DIS-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX90A-V2A-DIS-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX90A-V2A-DIS-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX90A-V2A-DIS-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX90A-V2A-DIS-NEXT:    s_mov_b32 s32, s33
 ; GFX90A-V2A-DIS-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX90A-V2A-DIS-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -2551,8 +2551,8 @@ define hidden void @func_call_clobber() #0 {
 ; GFX90A-V2A-EN-NEXT:    s_addc_u32 s17, s17, ex@rel32@hi+12
 ; GFX90A-V2A-EN-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX90A-V2A-EN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX90A-V2A-EN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX90A-V2A-EN-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX90A-V2A-EN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX90A-V2A-EN-NEXT:    s_mov_b32 s32, s33
 ; GFX90A-V2A-EN-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX90A-V2A-EN-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -2793,8 +2793,8 @@ define hidden void @func_call_clobber() #0 {
 ; WAVE32-NEXT:    s_addc_u32 s17, s17, ex@rel32@hi+12
 ; WAVE32-NEXT:    v_writelane_b32 v40, s31, 1
 ; WAVE32-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; WAVE32-NEXT:    v_readlane_b32 s31, v40, 1
 ; WAVE32-NEXT:    v_readlane_b32 s30, v40, 0
+; WAVE32-NEXT:    v_readlane_b32 s31, v40, 1
 ; WAVE32-NEXT:    s_mov_b32 s32, s33
 ; WAVE32-NEXT:    v_readlane_b32 s4, v40, 2
 ; WAVE32-NEXT:    s_or_saveexec_b32 s5, -1
diff --git a/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll b/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll
index a0c25b2a0beb3..705d403764503 100644
--- a/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll
+++ b/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll
@@ -489,22 +489,20 @@ define weak_odr void @test(i32 %0) !dbg !34 {
 ; CHECK-NEXT:    v_writelane_b32 v41, s16, 16
 ; CHECK-NEXT:    .cfi_llvm_vector_registers 65, 2601, 16, 32
 ; CHECK-NEXT:    .cfi_def_cfa_register 65
-; CHECK-NEXT:    v_writelane_b32 v41, s30, 0
-; CHECK-NEXT:    v_writelane_b32 v41, s31, 1
-; CHECK-NEXT:    v_writelane_b32 v41, s34, 2
-; CHECK-NEXT:    v_writelane_b32 v41, s35, 3
-; CHECK-NEXT:    v_writelane_b32 v41, s36, 4
-; CHECK-NEXT:    v_writelane_b32 v41, s37, 5
-; CHECK-NEXT:    v_writelane_b32 v41, s38, 6
-; CHECK-NEXT:    v_writelane_b32 v41, s39, 7
-; CHECK-NEXT:    v_writelane_b32 v41, s48, 8
-; CHECK-NEXT:    v_writelane_b32 v41, s49, 9
-; CHECK-NEXT:    v_writelane_b32 v41, s50, 10
-; CHECK-NEXT:    v_writelane_b32 v41, s51, 11
-; CHECK-NEXT:    v_writelane_b32 v41, s52, 12
+; CHECK-NEXT:    v_writelane_b32 v41, s34, 0
+; CHECK-NEXT:    v_writelane_b32 v41, s35, 1
+; CHECK-NEXT:    v_writelane_b32 v41, s36, 2
+; CHECK-NEXT:    v_writelane_b32 v41, s37, 3
+; CHECK-NEXT:    v_writelane_b32 v41, s38, 4
+; CHECK-NEXT:    v_writelane_b32 v41, s39, 5
+; CHECK-NEXT:    v_writelane_b32 v41, s48, 6
+; CHECK-NEXT:    v_writelane_b32 v41, s49, 7
+; CHECK-NEXT:    v_writelane_b32 v41, s50, 8
+; CHECK-NEXT:    v_writelane_b32 v41, s51, 9
+; CHECK-NEXT:    v_writelane_b32 v41, s52, 10
 ; CHECK-NEXT:    s_addk_i32 s32, 0x400
-; CHECK-NEXT:    v_writelane_b32 v41, s53, 13
-; CHECK-NEXT:    v_writelane_b32 v41, s54, 14
+; CHECK-NEXT:    v_writelane_b32 v41, s53, 11
+; CHECK-NEXT:    v_writelane_b32 v41, s54, 12
 ; CHECK-NEXT:    s_mov_b64 s[48:49], s[4:5]
 ; CHECK-NEXT:    ;DEBUG_VALUE: dummy:dummy <- undef
 ; CHECK-NEXT:  .Ltmp0:
@@ -512,10 +510,12 @@ define weak_odr void @test(i32 %0) !dbg !34 {
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
 ; CHECK-NEXT:    s_add_u32 s4, s4, __kmpc_alloc_shared@gotpcrel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s5, s5, __kmpc_alloc_shared@gotpcrel32@hi+12
-; CHECK-NEXT:    v_writelane_b32 v41, s55, 15
+; CHECK-NEXT:    v_writelane_b32 v41, s55, 13
 ; CHECK-NEXT:    s_load_dwordx2 s[54:55], s[4:5], 0x0
+; CHECK-NEXT:    v_writelane_b32 v41, s30, 14
 ; CHECK-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; CHECK-NEXT:    v_writelane_b32 v41, s31, 15
 ; CHECK-NEXT:    v_mov_b32_e32 v40, v31
 ; CHECK-NEXT:    s_mov_b32 s50, s15
 ; CHECK-NEXT:    s_mov_b32 s51, s14
@@ -541,23 +541,23 @@ define weak_odr void @test(i32 %0) !dbg !34 {
 ; CHECK-NEXT:    .loc 1 0 9 is_stmt 0 ; dummy:0:9
 ; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 0
+; CHECK-NEXT:    v_readlane_b32 s30, v41, 14
 ; CHECK-NEXT:    flat_store_dword v[0:1], v2
-; CHECK-NEXT:    v_readlane_b32 s55, v41, 15
-; CHECK-NEXT:    v_readlane_b32 s54, v41, 14
-; CHECK-NEXT:    v_readlane_b32 s53, v41, 13
-; CHECK-NEXT:    v_readlane_b32 s52, v41, 12
-; CHECK-NEXT:    v_readlane_b32 s51, v41, 11
-; CHECK-NEXT:    v_readlane_b32 s50, v41, 10
-; CHECK-NEXT:    v_readlane_b32 s49, v41, 9
-; CHECK-NEXT:    v_readlane_b32 s48, v41, 8
-; CHECK-NEXT:    v_readlane_b32 s39, v41, 7
-; CHECK-NEXT:    v_readlane_b32 s38, v41, 6
-; CHECK-NEXT:    v_readlane_b32 s37, v41, 5
-; CHECK-NEXT:    v_readlane_b32 s36, v41, 4
-; CHECK-NEXT:    v_readlane_b32 s35, v41, 3
-; CHECK-NEXT:    v_readlane_b32 s34, v41, 2
-; CHECK-NEXT:    v_readlane_b32 s31, v41, 1
-; CHECK-NEXT:    v_readlane_b32 s30, v41, 0
+; CHECK-NEXT:    v_readlane_b32 s31, v41, 15
+; CHECK-NEXT:    v_readlane_b32 s55, v41, 13
+; CHECK-NEXT:    v_readlane_b32 s54, v41, 12
+; CHECK-NEXT:    v_readlane_b32 s53, v41, 11
+; CHECK-NEXT:    v_readlane_b32 s52, v41, 10
+; CHECK-NEXT:    v_readlane_b32 s51, v41, 9
+; CHECK-NEXT:    v_readlane_b32 s50, v41, 8
+; CHECK-NEXT:    v_readlane_b32 s49, v41, 7
+; CHECK-NEXT:    v_readlane_b32 s48, v41, 6
+; CHECK-NEXT:    v_readlane_b32 s39, v41, 5
+; CHECK-NEXT:    v_readlane_b32 s38, v41, 4
+; CHECK-NEXT:    v_readlane_b32 s37, v41, 3
+; CHECK-NEXT:    v_readlane_b32 s36, v41, 2
+; CHECK-NEXT:    v_readlane_b32 s35, v41, 1
+; CHECK-NEXT:    v_readlane_b32 s34, v41, 0
 ; CHECK-NEXT:    s_mov_b32 s32, s33
 ; CHECK-NEXT:    v_readlane_b32 s4, v41, 16
 ; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll b/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll
index d224cfe27b226..fbacc61492674 100644
--- a/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll
+++ b/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll
@@ -299,8 +299,8 @@ define amdgpu_gfx void @amdgpu_gfx() #0 {
 ; CHECK-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; CHECK-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; CHECK-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; CHECK-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; CHECK-TRUE16-NEXT:    v_readlane_b32 s30, v40, 0
+; CHECK-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; CHECK-TRUE16-NEXT:    s_mov_b32 s32, s33
 ; CHECK-TRUE16-NEXT:    v_readlane_b32 s0, v40, 2
 ; CHECK-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -339,8 +339,8 @@ define amdgpu_gfx void @amdgpu_gfx() #0 {
 ; CHECK-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; CHECK-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; CHECK-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; CHECK-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; CHECK-FAKE16-NEXT:    v_readlane_b32 s30, v40, 0
+; CHECK-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; CHECK-FAKE16-NEXT:    s_mov_b32 s32, s33
 ; CHECK-FAKE16-NEXT:    v_readlane_b32 s0, v40, 2
 ; CHECK-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
diff --git a/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-select.ll b/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-select.ll
index 1269b2d0f7367..2d620a14da405 100644
--- a/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-select.ll
@@ -24,34 +24,34 @@ define void @wobble() #0 {
 ; CHECK-NEXT:    v_mov_b32_e32 v40, v31
 ; CHECK-NEXT:    v_mov_b32_e32 v41, 0
 ; CHECK-NEXT:    s_addk_i32 s32, 0x400
-; CHECK-NEXT:    v_writelane_b32 v43, s30, 0
-; CHECK-NEXT:    v_writelane_b32 v43, s31, 1
-; CHECK-NEXT:    v_writelane_b32 v43, s34, 2
-; CHECK-NEXT:    v_writelane_b32 v43, s35, 3
+; CHECK-NEXT:    v_writelane_b32 v43, s34, 0
+; CHECK-NEXT:    v_writelane_b32 v43, s35, 1
 ; CHECK-NEXT:    s_mov_b64 s[34:35], s[10:11]
-; CHECK-NEXT:    v_writelane_b32 v43, s36, 4
-; CHECK-NEXT:    v_writelane_b32 v43, s37, 5
+; CHECK-NEXT:    v_writelane_b32 v43, s36, 2
+; CHECK-NEXT:    v_writelane_b32 v43, s37, 3
 ; CHECK-NEXT:    s_mov_b64 s[36:37], s[8:9]
 ; CHECK-NEXT:    s_mov_b64 s[8:9], src_private_base
 ; CHECK-NEXT:    v_mov_b32_e32 v42, s9
-; CHECK-NEXT:    v_writelane_b32 v43, s38, 6
-; CHECK-NEXT:    v_writelane_b32 v43, s39, 7
+; CHECK-NEXT:    v_writelane_b32 v43, s38, 4
+; CHECK-NEXT:    v_writelane_b32 v43, s39, 5
 ; CHECK-NEXT:    s_mov_b64 s[38:39], s[6:7]
-; CHECK-NEXT:    v_writelane_b32 v43, s48, 8
-; CHECK-NEXT:    v_writelane_b32 v43, s49, 9
+; CHECK-NEXT:    v_writelane_b32 v43, s48, 6
+; CHECK-NEXT:    v_writelane_b32 v43, s49, 7
 ; CHECK-NEXT:    s_mov_b64 s[48:49], s[4:5]
 ; CHECK-NEXT:    s_lshr_b32 s5, s33, 5
 ; CHECK-NEXT:    s_mov_b32 s4, 0
-; CHECK-NEXT:    v_writelane_b32 v43, s50, 10
+; CHECK-NEXT:    v_writelane_b32 v43, s50, 8
 ; CHECK-NEXT:    s_mov_b32 s50, s15
-; CHECK-NEXT:    v_writelane_b32 v43, s51, 11
+; CHECK-NEXT:    v_writelane_b32 v43, s51, 9
 ; CHECK-NEXT:    s_mov_b32 s51, s14
-; CHECK-NEXT:    v_writelane_b32 v43, s52, 12
+; CHECK-NEXT:    v_writelane_b32 v43, s52, 10
 ; CHECK-NEXT:    s_mov_b32 s52, s13
-; CHECK-NEXT:    v_writelane_b32 v43, s53, 13
+; CHECK-NEXT:    v_writelane_b32 v43, s53, 11
 ; CHECK-NEXT:    s_mov_b32 s53, s12
-; CHECK-NEXT:    v_writelane_b32 v43, s54, 14
+; CHECK-NEXT:    v_writelane_b32 v43, s54, 12
 ; CHECK-NEXT:    s_add_i32 s54, s5, 16
+; CHECK-NEXT:    v_writelane_b32 v43, s30, 13
+; CHECK-NEXT:    v_writelane_b32 v43, s31, 14
 ; CHECK-NEXT:    s_inst_prefetch 0x1
 ; CHECK-NEXT:    .p2align 6
 ; CHECK-NEXT:  .LBB0_1: ; %bb1
@@ -91,21 +91,21 @@ define void @wobble() #0 {
 ; CHECK-NEXT:    buffer_load_dword v42, off, s[0:3], s33
 ; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8
-; CHECK-NEXT:    v_readlane_b32 s54, v43, 14
-; CHECK-NEXT:    v_readlane_b32 s53, v43, 13
-; CHECK-NEXT:    v_readlane_b32 s52, v43, 12
-; CHECK-NEXT:    v_readlane_b32 s51, v43, 11
-; CHECK-NEXT:    v_readlane_b32 s50, v43, 10
-; CHECK-NEXT:    v_readlane_b32 s49, v43, 9
-; CHECK-NEXT:    v_readlane_b32 s48, v43, 8
-; CHECK-NEXT:    v_readlane_b32 s39, v43, 7
-; CHECK-NEXT:    v_readlane_b32 s38, v43, 6
-; CHECK-NEXT:    v_readlane_b32 s37, v43, 5
-; CHECK-NEXT:    v_readlane_b32 s36, v43, 4
-; CHECK-NEXT:    v_readlane_b32 s35, v43, 3
-; CHECK-NEXT:    v_readlane_b32 s34, v43, 2
-; CHECK-NEXT:    v_readlane_b32 s31, v43, 1
-; CHECK-NEXT:    v_readlane_b32 s30, v43, 0
+; CHECK-NEXT:    v_readlane_b32 s30, v43, 13
+; CHECK-NEXT:    v_readlane_b32 s31, v43, 14
+; CHECK-NEXT:    v_readlane_b32 s54, v43, 12
+; CHECK-NEXT:    v_readlane_b32 s53, v43, 11
+; CHECK-NEXT:    v_readlane_b32 s52, v43, 10
+; CHECK-NEXT:    v_readlane_b32 s51, v43, 9
+; CHECK-NEXT:    v_readlane_b32 s50, v43, 8
+; CHECK-NEXT:    v_readlane_b32 s49, v43, 7
+; CHECK-NEXT:    v_readlane_b32 s48, v43, 6
+; CHECK-NEXT:    v_readlane_b32 s39, v43, 5
+; CHECK-NEXT:    v_readlane_b32 s38, v43, 4
+; CHECK-NEXT:    v_readlane_b32 s37, v43, 3
+; CHECK-NEXT:    v_readlane_b32 s36, v43, 2
+; CHECK-NEXT:    v_readlane_b32 s35, v43, 1
+; CHECK-NEXT:    v_readlane_b32 s34, v43, 0
 ; CHECK-NEXT:    s_mov_b32 s32, s33
 ; CHECK-NEXT:    v_readlane_b32 s4, v43, 15
 ; CHECK-NEXT:    s_or_saveexec_b32 s5, -1
diff --git a/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll b/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll
index 76a2114a000cf..cba5aa8ef3672 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll
+++ b/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll
@@ -55,8 +55,8 @@ define void @test_stack_realign(<8 x i32> %val, i32 %idx) #0 {
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GCN-NEXT:    v_readlane_b32 s31, v42, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v42, 0
+; GCN-NEXT:    v_readlane_b32 s31, v42, 1
 ; GCN-NEXT:    s_mov_b32 s32, s34
 ; GCN-NEXT:    v_readlane_b32 s4, v42, 2
 ; GCN-NEXT:    v_readlane_b32 s34, v42, 3
diff --git a/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll
index 2e88da142bb41..6abe5998d6767 100644
--- a/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll
+++ b/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll
@@ -26,8 +26,8 @@ define void @callee_with_stack_and_call() #0 {
 ; SPILL-TO-VGPR-NEXT:    buffer_store_dword v0, off, s[0:3], s33
 ; SPILL-TO-VGPR-NEXT:    s_waitcnt vmcnt(0)
 ; SPILL-TO-VGPR-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; SPILL-TO-VGPR-NEXT:    v_readlane_b32 s31, v40, 1
 ; SPILL-TO-VGPR-NEXT:    v_readlane_b32 s30, v40, 0
+; SPILL-TO-VGPR-NEXT:    v_readlane_b32 s31, v40, 1
 ; SPILL-TO-VGPR-NEXT:    s_mov_b32 s32, s33
 ; SPILL-TO-VGPR-NEXT:    v_readlane_b32 s4, v40, 2
 ; SPILL-TO-VGPR-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -46,21 +46,14 @@ define void @callee_with_stack_and_call() #0 {
 ; NO-SPILL-TO-VGPR-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
 ; NO-SPILL-TO-VGPR-NEXT:    s_addk_i32 s32, 0x800
 ; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 s[4:5], exec
-; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 exec, 1
+; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 exec, 3
 ; NO-SPILL-TO-VGPR-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:16
 ; NO-SPILL-TO-VGPR-NEXT:    v_writelane_b32 v0, s30, 0
+; NO-SPILL-TO-VGPR-NEXT:    v_writelane_b32 v0, s31, 1
 ; NO-SPILL-TO-VGPR-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; NO-SPILL-TO-VGPR-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:16
 ; NO-SPILL-TO-VGPR-NEXT:    s_waitcnt vmcnt(0)
 ; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 exec, s[4:5]
-; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 s[4:5], exec
-; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 exec, 1
-; NO-SPILL-TO-VGPR-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:16
-; NO-SPILL-TO-VGPR-NEXT:    v_writelane_b32 v0, s31, 0
-; NO-SPILL-TO-VGPR-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; NO-SPILL-TO-VGPR-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:16
-; NO-SPILL-TO-VGPR-NEXT:    s_waitcnt vmcnt(0)
-; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 exec, s[4:5]
 ; NO-SPILL-TO-VGPR-NEXT:    v_mov_b32_e32 v0, 0
 ; NO-SPILL-TO-VGPR-NEXT:    s_getpc_b64 s[4:5]
 ; NO-SPILL-TO-VGPR-NEXT:    s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
@@ -69,20 +62,12 @@ define void @callee_with_stack_and_call() #0 {
 ; NO-SPILL-TO-VGPR-NEXT:    s_waitcnt vmcnt(0)
 ; NO-SPILL-TO-VGPR-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 s[4:5], exec
-; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 exec, 1
-; NO-SPILL-TO-VGPR-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:16
-; NO-SPILL-TO-VGPR-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; NO-SPILL-TO-VGPR-NEXT:    s_waitcnt vmcnt(0)
-; NO-SPILL-TO-VGPR-NEXT:    v_readlane_b32 s31, v0, 0
-; NO-SPILL-TO-VGPR-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:16
-; NO-SPILL-TO-VGPR-NEXT:    s_waitcnt vmcnt(0)
-; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 exec, s[4:5]
-; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 s[4:5], exec
-; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 exec, 1
+; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 exec, 3
 ; NO-SPILL-TO-VGPR-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:16
 ; NO-SPILL-TO-VGPR-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; NO-SPILL-TO-VGPR-NEXT:    s_waitcnt vmcnt(0)
 ; NO-SPILL-TO-VGPR-NEXT:    v_readlane_b32 s30, v0, 0
+; NO-SPILL-TO-VGPR-NEXT:    v_readlane_b32 s31, v0, 1
 ; NO-SPILL-TO-VGPR-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:16
 ; NO-SPILL-TO-VGPR-NEXT:    s_waitcnt vmcnt(0)
 ; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 exec, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll b/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll
index 8ad19c682ac23..3293c324cf507 100644
--- a/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll
@@ -2065,8 +2065,8 @@ define void @caller_void_func_i32_v2float_inreg(i32 inreg %arg0, <2 x float> inr
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[20:21]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -2095,8 +2095,8 @@ define void @caller_void_func_i32_v2float_inreg(i32 inreg %arg0, <2 x float> inr
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -2484,8 +2484,8 @@ define void @void_func_a13i32_inreg([13  x i32] inreg %arg0, ptr addrspace(1) %p
 ; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -2526,8 +2526,8 @@ define void @void_func_a13i32_inreg([13  x i32] inreg %arg0, ptr addrspace(1) %p
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll b/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll
index 9d137fb4101e4..a2f203a111e18 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll
@@ -39,47 +39,47 @@ define amdgpu_gfx void @gfx_func() {
 ; SDAG-NEXT:    v_writelane_b32 v40, s27, 23
 ; SDAG-NEXT:    v_writelane_b32 v40, s28, 24
 ; SDAG-NEXT:    v_writelane_b32 v40, s29, 25
-; SDAG-NEXT:    v_writelane_b32 v40, s30, 26
-; SDAG-NEXT:    v_writelane_b32 v40, s31, 27
-; SDAG-NEXT:    v_writelane_b32 v40, s72, 28
-; SDAG-NEXT:    v_writelane_b32 v40, s73, 29
-; SDAG-NEXT:    v_writelane_b32 v40, s74, 30
-; SDAG-NEXT:    v_writelane_b32 v40, s75, 31
-; SDAG-NEXT:    v_writelane_b32 v40, s76, 32
-; SDAG-NEXT:    v_writelane_b32 v40, s77, 33
-; SDAG-NEXT:    v_writelane_b32 v40, s78, 34
-; SDAG-NEXT:    v_writelane_b32 v40, s79, 35
-; SDAG-NEXT:    v_writelane_b32 v40, s88, 36
-; SDAG-NEXT:    v_writelane_b32 v40, s89, 37
-; SDAG-NEXT:    v_writelane_b32 v40, s90, 38
-; SDAG-NEXT:    v_writelane_b32 v40, s91, 39
-; SDAG-NEXT:    v_writelane_b32 v40, s92, 40
-; SDAG-NEXT:    v_writelane_b32 v40, s93, 41
-; SDAG-NEXT:    v_writelane_b32 v40, s94, 42
+; SDAG-NEXT:    v_writelane_b32 v40, s72, 26
+; SDAG-NEXT:    v_writelane_b32 v40, s73, 27
+; SDAG-NEXT:    v_writelane_b32 v40, s74, 28
+; SDAG-NEXT:    v_writelane_b32 v40, s75, 29
+; SDAG-NEXT:    v_writelane_b32 v40, s76, 30
+; SDAG-NEXT:    v_writelane_b32 v40, s77, 31
+; SDAG-NEXT:    v_writelane_b32 v40, s78, 32
+; SDAG-NEXT:    v_writelane_b32 v40, s79, 33
+; SDAG-NEXT:    v_writelane_b32 v40, s88, 34
+; SDAG-NEXT:    v_writelane_b32 v40, s89, 35
+; SDAG-NEXT:    v_writelane_b32 v40, s90, 36
+; SDAG-NEXT:    v_writelane_b32 v40, s91, 37
+; SDAG-NEXT:    v_writelane_b32 v40, s92, 38
+; SDAG-NEXT:    v_writelane_b32 v40, s93, 39
+; SDAG-NEXT:    v_writelane_b32 v40, s94, 40
+; SDAG-NEXT:    v_writelane_b32 v40, s95, 41
+; SDAG-NEXT:    v_writelane_b32 v40, s30, 42
 ; SDAG-NEXT:    s_mov_b32 s35, extern_c_func@abs32@hi
 ; SDAG-NEXT:    s_mov_b32 s34, extern_c_func@abs32@lo
 ; SDAG-NEXT:    s_mov_b64 s[8:9], 0
 ; SDAG-NEXT:    s_addk_i32 s32, 0x400
-; SDAG-NEXT:    v_writelane_b32 v40, s95, 43
+; SDAG-NEXT:    v_writelane_b32 v40, s31, 43
 ; SDAG-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; SDAG-NEXT:    v_readlane_b32 s95, v40, 43
-; SDAG-NEXT:    v_readlane_b32 s94, v40, 42
-; SDAG-NEXT:    v_readlane_b32 s93, v40, 41
-; SDAG-NEXT:    v_readlane_b32 s92, v40, 40
-; SDAG-NEXT:    v_readlane_b32 s91, v40, 39
-; SDAG-NEXT:    v_readlane_b32 s90, v40, 38
-; SDAG-NEXT:    v_readlane_b32 s89, v40, 37
-; SDAG-NEXT:    v_readlane_b32 s88, v40, 36
-; SDAG-NEXT:    v_readlane_b32 s79, v40, 35
-; SDAG-NEXT:    v_readlane_b32 s78, v40, 34
-; SDAG-NEXT:    v_readlane_b32 s77, v40, 33
-; SDAG-NEXT:    v_readlane_b32 s76, v40, 32
-; SDAG-NEXT:    v_readlane_b32 s75, v40, 31
-; SDAG-NEXT:    v_readlane_b32 s74, v40, 30
-; SDAG-NEXT:    v_readlane_b32 s73, v40, 29
-; SDAG-NEXT:    v_readlane_b32 s72, v40, 28
-; SDAG-NEXT:    v_readlane_b32 s31, v40, 27
-; SDAG-NEXT:    v_readlane_b32 s30, v40, 26
+; SDAG-NEXT:    v_readlane_b32 s30, v40, 42
+; SDAG-NEXT:    v_readlane_b32 s31, v40, 43
+; SDAG-NEXT:    v_readlane_b32 s95, v40, 41
+; SDAG-NEXT:    v_readlane_b32 s94, v40, 40
+; SDAG-NEXT:    v_readlane_b32 s93, v40, 39
+; SDAG-NEXT:    v_readlane_b32 s92, v40, 38
+; SDAG-NEXT:    v_readlane_b32 s91, v40, 37
+; SDAG-NEXT:    v_readlane_b32 s90, v40, 36
+; SDAG-NEXT:    v_readlane_b32 s89, v40, 35
+; SDAG-NEXT:    v_readlane_b32 s88, v40, 34
+; SDAG-NEXT:    v_readlane_b32 s79, v40, 33
+; SDAG-NEXT:    v_readlane_b32 s78, v40, 32
+; SDAG-NEXT:    v_readlane_b32 s77, v40, 31
+; SDAG-NEXT:    v_readlane_b32 s76, v40, 30
+; SDAG-NEXT:    v_readlane_b32 s75, v40, 29
+; SDAG-NEXT:    v_readlane_b32 s74, v40, 28
+; SDAG-NEXT:    v_readlane_b32 s73, v40, 27
+; SDAG-NEXT:    v_readlane_b32 s72, v40, 26
 ; SDAG-NEXT:    v_readlane_b32 s29, v40, 25
 ; SDAG-NEXT:    v_readlane_b32 s28, v40, 24
 ; SDAG-NEXT:    v_readlane_b32 s27, v40, 23
@@ -148,47 +148,47 @@ define amdgpu_gfx void @gfx_func() {
 ; GISEL-NEXT:    v_writelane_b32 v40, s27, 23
 ; GISEL-NEXT:    v_writelane_b32 v40, s28, 24
 ; GISEL-NEXT:    v_writelane_b32 v40, s29, 25
-; GISEL-NEXT:    v_writelane_b32 v40, s30, 26
-; GISEL-NEXT:    v_writelane_b32 v40, s31, 27
-; GISEL-NEXT:    v_writelane_b32 v40, s72, 28
-; GISEL-NEXT:    v_writelane_b32 v40, s73, 29
-; GISEL-NEXT:    v_writelane_b32 v40, s74, 30
-; GISEL-NEXT:    v_writelane_b32 v40, s75, 31
-; GISEL-NEXT:    v_writelane_b32 v40, s76, 32
-; GISEL-NEXT:    v_writelane_b32 v40, s77, 33
-; GISEL-NEXT:    v_writelane_b32 v40, s78, 34
-; GISEL-NEXT:    v_writelane_b32 v40, s79, 35
-; GISEL-NEXT:    v_writelane_b32 v40, s88, 36
-; GISEL-NEXT:    v_writelane_b32 v40, s89, 37
-; GISEL-NEXT:    v_writelane_b32 v40, s90, 38
-; GISEL-NEXT:    v_writelane_b32 v40, s91, 39
-; GISEL-NEXT:    v_writelane_b32 v40, s92, 40
-; GISEL-NEXT:    v_writelane_b32 v40, s93, 41
-; GISEL-NEXT:    v_writelane_b32 v40, s94, 42
+; GISEL-NEXT:    v_writelane_b32 v40, s72, 26
+; GISEL-NEXT:    v_writelane_b32 v40, s73, 27
+; GISEL-NEXT:    v_writelane_b32 v40, s74, 28
+; GISEL-NEXT:    v_writelane_b32 v40, s75, 29
+; GISEL-NEXT:    v_writelane_b32 v40, s76, 30
+; GISEL-NEXT:    v_writelane_b32 v40, s77, 31
+; GISEL-NEXT:    v_writelane_b32 v40, s78, 32
+; GISEL-NEXT:    v_writelane_b32 v40, s79, 33
+; GISEL-NEXT:    v_writelane_b32 v40, s88, 34
+; GISEL-NEXT:    v_writelane_b32 v40, s89, 35
+; GISEL-NEXT:    v_writelane_b32 v40, s90, 36
+; GISEL-NEXT:    v_writelane_b32 v40, s91, 37
+; GISEL-NEXT:    v_writelane_b32 v40, s92, 38
+; GISEL-NEXT:    v_writelane_b32 v40, s93, 39
+; GISEL-NEXT:    v_writelane_b32 v40, s94, 40
+; GISEL-NEXT:    v_writelane_b32 v40, s95, 41
+; GISEL-NEXT:    v_writelane_b32 v40, s30, 42
 ; GISEL-NEXT:    s_mov_b32 s34, extern_c_func@abs32@lo
 ; GISEL-NEXT:    s_mov_b32 s35, extern_c_func@abs32@hi
 ; GISEL-NEXT:    s_mov_b64 s[8:9], 0
 ; GISEL-NEXT:    s_addk_i32 s32, 0x400
-; GISEL-NEXT:    v_writelane_b32 v40, s95, 43
+; GISEL-NEXT:    v_writelane_b32 v40, s31, 43
 ; GISEL-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GISEL-NEXT:    v_readlane_b32 s95, v40, 43
-; GISEL-NEXT:    v_readlane_b32 s94, v40, 42
-; GISEL-NEXT:    v_readlane_b32 s93, v40, 41
-; GISEL-NEXT:    v_readlane_b32 s92, v40, 40
-; GISEL-NEXT:    v_readlane_b32 s91, v40, 39
-; GISEL-NEXT:    v_readlane_b32 s90, v40, 38
-; GISEL-NEXT:    v_readlane_b32 s89, v40, 37
-; GISEL-NEXT:    v_readlane_b32 s88, v40, 36
-; GISEL-NEXT:    v_readlane_b32 s79, v40, 35
-; GISEL-NEXT:    v_readlane_b32 s78, v40, 34
-; GISEL-NEXT:    v_readlane_b32 s77, v40, 33
-; GISEL-NEXT:    v_readlane_b32 s76, v40, 32
-; GISEL-NEXT:    v_readlane_b32 s75, v40, 31
-; GISEL-NEXT:    v_readlane_b32 s74, v40, 30
-; GISEL-NEXT:    v_readlane_b32 s73, v40, 29
-; GISEL-NEXT:    v_readlane_b32 s72, v40, 28
-; GISEL-NEXT:    v_readlane_b32 s31, v40, 27
-; GISEL-NEXT:    v_readlane_b32 s30, v40, 26
+; GISEL-NEXT:    v_readlane_b32 s30, v40, 42
+; GISEL-NEXT:    v_readlane_b32 s31, v40, 43
+; GISEL-NEXT:    v_readlane_b32 s95, v40, 41
+; GISEL-NEXT:    v_readlane_b32 s94, v40, 40
+; GISEL-NEXT:    v_readlane_b32 s93, v40, 39
+; GISEL-NEXT:    v_readlane_b32 s92, v40, 38
+; GISEL-NEXT:    v_readlane_b32 s91, v40, 37
+; GISEL-NEXT:    v_readlane_b32 s90, v40, 36
+; GISEL-NEXT:    v_readlane_b32 s89, v40, 35
+; GISEL-NEXT:    v_readlane_b32 s88, v40, 34
+; GISEL-NEXT:    v_readlane_b32 s79, v40, 33
+; GISEL-NEXT:    v_readlane_b32 s78, v40, 32
+; GISEL-NEXT:    v_readlane_b32 s77, v40, 31
+; GISEL-NEXT:    v_readlane_b32 s76, v40, 30
+; GISEL-NEXT:    v_readlane_b32 s75, v40, 29
+; GISEL-NEXT:    v_readlane_b32 s74, v40, 28
+; GISEL-NEXT:    v_readlane_b32 s73, v40, 27
+; GISEL-NEXT:    v_readlane_b32 s72, v40, 26
 ; GISEL-NEXT:    v_readlane_b32 s29, v40, 25
 ; GISEL-NEXT:    v_readlane_b32 s28, v40, 24
 ; GISEL-NEXT:    v_readlane_b32 s27, v40, 23
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
index b5a5a930000e5..d6e606c96bc75 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
@@ -141,8 +141,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm() #0 {
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    buffer_store_byte v0, off, s[0:3], s32
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -170,8 +170,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm() #0 {
 ; GFX10-NEXT:    buffer_store_byte v0, off, s[0:3], s32
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -200,8 +200,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -229,8 +229,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_byte off, v0, s32
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -264,8 +264,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 {
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    buffer_store_byte v0, off, s[0:3], s32
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -295,8 +295,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 {
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    buffer_store_byte v0, off, s[0:3], s32
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -326,8 +326,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 {
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX11-NEXT:    scratch_store_b8 off, v0, s32
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -357,8 +357,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX10-SCRATCH-NEXT:    scratch_store_byte off, v0, s32
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -393,8 +393,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 {
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    buffer_store_byte v0, off, s[0:3], s32
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -424,8 +424,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 {
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    buffer_store_byte v0, off, s[0:3], s32
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -455,8 +455,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 {
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX11-NEXT:    scratch_store_b8 off, v0, s32
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -486,8 +486,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX10-SCRATCH-NEXT:    scratch_store_byte off, v0, s32
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -519,8 +519,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -547,8 +547,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -576,8 +576,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 {
 ; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -604,8 +604,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 {
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -632,8 +632,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -665,8 +665,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -694,8 +694,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -724,8 +724,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 {
 ; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -753,8 +753,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 {
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -782,8 +782,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -816,8 +816,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -845,8 +845,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -875,8 +875,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 {
 ; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -904,8 +904,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 {
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -933,8 +933,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -966,8 +966,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -994,8 +994,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -1023,8 +1023,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 {
 ; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -1051,8 +1051,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 {
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -1079,8 +1079,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -1112,8 +1112,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -1141,8 +1141,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -1171,8 +1171,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 {
 ; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -1200,8 +1200,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 {
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -1229,8 +1229,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -1263,8 +1263,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -1292,8 +1292,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -1322,8 +1322,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 {
 ; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -1351,8 +1351,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 {
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -1380,8 +1380,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -1413,8 +1413,8 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm(i32) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -1441,8 +1441,8 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm(i32) #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -1470,8 +1470,8 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm(i32) #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -1498,8 +1498,8 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -1531,8 +1531,8 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -1560,8 +1560,8 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm() #0 {
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -1589,8 +1589,8 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -1618,8 +1618,8 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -1652,8 +1652,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -1682,8 +1682,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -1713,8 +1713,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -1743,8 +1743,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -1779,8 +1779,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -1810,8 +1810,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm() #0 {
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -1840,8 +1840,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -1871,8 +1871,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -1907,8 +1907,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -1939,8 +1939,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 {
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -1970,8 +1970,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -2002,8 +2002,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 {
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -2043,8 +2043,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -2077,8 +2077,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 {
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -2109,8 +2109,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -2143,8 +2143,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 {
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -2177,8 +2177,8 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -2205,8 +2205,8 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -2234,8 +2234,8 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 {
 ; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -2262,8 +2262,8 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 {
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -2290,8 +2290,8 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -2322,8 +2322,8 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -2350,8 +2350,8 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -2379,8 +2379,8 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -2407,8 +2407,8 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -2440,8 +2440,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -2469,8 +2469,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm() #0 {
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -2498,8 +2498,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -2527,8 +2527,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -2561,8 +2561,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -2591,8 +2591,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm() #0 {
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -2621,8 +2621,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -2651,8 +2651,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -2687,8 +2687,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -2719,8 +2719,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 {
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -2750,8 +2750,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -2782,8 +2782,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -2815,8 +2815,8 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -2844,8 +2844,8 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm() #0 {
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -2873,8 +2873,8 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -2902,8 +2902,8 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -2937,8 +2937,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -2968,8 +2968,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm() #0 {
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -2998,8 +2998,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -3029,8 +3029,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -3066,8 +3066,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -3099,8 +3099,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 {
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -3130,8 +3130,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -3163,8 +3163,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -3200,8 +3200,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8() #0 {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -3233,8 +3233,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8() #0 {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -3267,8 +3267,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8() #0 {
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
 ; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -3300,8 +3300,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8() #0 {
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
 ; GFX11-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -3333,8 +3333,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8() #0 {
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -3371,8 +3371,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8() #0 {
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -3404,8 +3404,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8() #0 {
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -3437,8 +3437,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8() #0 {
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -3470,8 +3470,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8() #0 {
 ; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -3509,8 +3509,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8() #0 {
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -3543,8 +3543,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8() #0 {
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -3577,8 +3577,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8() #0 {
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -3611,8 +3611,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8() #0 {
 ; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -3652,8 +3652,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v5
 ; GFX9-NEXT:    v_mov_b32_e32 v4, v6
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -3688,8 +3688,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8() #0 {
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v5
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v6
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -3724,8 +3724,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8() #0 {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v5
 ; GFX11-NEXT:    v_mov_b32_e32 v4, v6
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -3760,8 +3760,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8() #0 {
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, v5
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v4, v6
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -3804,8 +3804,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v8
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -3843,8 +3843,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8() #0 {
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v8
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -3881,8 +3881,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8() #0 {
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
 ; GFX11-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v1, v8
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -3920,8 +3920,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8() #0 {
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, v8
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -3996,8 +3996,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v18, v33
 ; GFX9-NEXT:    v_mov_b32_e32 v19, v34
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -4068,8 +4068,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8() #0 {
 ; GFX10-NEXT:    v_mov_b32_e32 v18, v33
 ; GFX10-NEXT:    v_mov_b32_e32 v19, v34
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -4135,8 +4135,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8() #0 {
 ; GFX11-NEXT:    v_dual_mov_b32 v17, v32 :: v_dual_mov_b32 v18, v33
 ; GFX11-NEXT:    v_mov_b32_e32 v19, v34
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -4207,8 +4207,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8() #0 {
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v18, v33
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v19, v34
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -4249,8 +4249,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_ret() #0 {
 ; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v42, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -4285,8 +4285,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_ret() #0 {
 ; GFX10-NEXT:    s_clause 0x1 ; 8-byte Folded Reload
 ; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4
-; GFX10-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v42, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -4322,8 +4322,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_ret() #0 {
 ; GFX11-TRUE16-NEXT:    s_clause 0x1 ; 8-byte Folded Reload
 ; GFX11-TRUE16-NEXT:    scratch_load_b32 v41, off, s33
 ; GFX11-TRUE16-NEXT:    scratch_load_b32 v40, off, s33 offset:4
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s0, v42, 2
 ; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -4358,8 +4358,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_ret() #0 {
 ; GFX11-FAKE16-NEXT:    s_clause 0x1 ; 8-byte Folded Reload
 ; GFX11-FAKE16-NEXT:    scratch_load_b32 v41, off, s33
 ; GFX11-FAKE16-NEXT:    scratch_load_b32 v40, off, s33 offset:4
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v42, 2
 ; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -4394,8 +4394,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_ret() #0 {
 ; GFX10-SCRATCH-NEXT:    s_clause 0x1 ; 8-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 offset:4
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v42, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -4442,8 +4442,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 {
 ; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v42, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -4478,8 +4478,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 {
 ; GFX10-NEXT:    v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_lshlrev_b16 v1, 8, v1
-; GFX10-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v42, 2
 ; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -4522,8 +4522,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 {
 ; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v0.h, 8, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v0.h
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
@@ -4566,8 +4566,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 {
 ; GFX11-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v1, 8, v1
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v42, 2
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -4607,8 +4607,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 {
 ; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_lshlrev_b16 v1, 8, v1
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v42, 2
 ; GFX10-SCRATCH-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -4663,8 +4663,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 {
 ; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v42, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -4701,8 +4701,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 {
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 2
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-NEXT:    v_perm_b32 v0, v0, v1, 0xc0c0004
-; GFX10-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    global_store_byte v[3:4], v2, off
 ; GFX10-NEXT:    global_store_short v[40:41], v0, off
@@ -4745,6 +4745,7 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 {
 ; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v3, 2
 ; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX11-TRUE16-NEXT:    v_perm_b32 v0, v0, v1, 0xc0c0004
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v42, 0
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
 ; GFX11-TRUE16-NEXT:    global_store_b8 v[3:4], v2, off
 ; GFX11-TRUE16-NEXT:    global_store_b16 v[40:41], v0, off
@@ -4752,7 +4753,6 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 {
 ; GFX11-TRUE16-NEXT:    scratch_load_b32 v41, off, s33
 ; GFX11-TRUE16-NEXT:    scratch_load_b32 v40, off, s33 offset:4
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v42, 1
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v42, 0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s0, v42, 2
 ; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -4789,8 +4789,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 {
 ; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v3, 2
 ; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v0, v1, 0xc0c0004
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-FAKE16-NEXT:    s_clause 0x1
 ; GFX11-FAKE16-NEXT:    global_store_b8 v[3:4], v2, off
@@ -4833,8 +4833,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 {
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v3, 2
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-SCRATCH-NEXT:    v_perm_b32 v0, v0, v1, 0xc0c0004
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    global_store_byte v[3:4], v2, off
 ; GFX10-SCRATCH-NEXT:    global_store_short v[40:41], v0, off
@@ -4889,8 +4889,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 {
 ; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v42, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -4927,8 +4927,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 {
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_perm_b32 v0, v0, v1, 0xc0c0004
 ; GFX10-NEXT:    v_perm_b32 v1, v2, v3, 0xc0c0004
-; GFX10-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v42, 2
 ; GFX10-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
@@ -4972,10 +4972,10 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 {
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_perm_b32 v0, v0, v1, 0xc0c0004
 ; GFX11-TRUE16-NEXT:    v_perm_b32 v1, v2, v3, 0xc0c0004
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v42, 1
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v42, 0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s0, v42, 2
 ; GFX11-TRUE16-NEXT:    global_store_b32 v[40:41], v0, off
@@ -5017,8 +5017,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 {
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v0, v1, 0xc0c0004
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v2, v3, 0xc0c0004
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v42, 2
 ; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
@@ -5060,8 +5060,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 {
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_perm_b32 v0, v0, v1, 0xc0c0004
 ; GFX10-SCRATCH-NEXT:    v_perm_b32 v1, v2, v3, 0xc0c0004
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v42, 2
 ; GFX10-SCRATCH-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
@@ -5121,8 +5121,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 {
 ; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v42, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -5163,8 +5163,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 {
 ; GFX10-NEXT:    v_perm_b32 v2, v2, v3, 0xc0c0004
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 4
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-NEXT:    v_lshl_or_b32 v2, v2, 16, v5
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v42, 2
@@ -5208,11 +5208,12 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 {
 ; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v0, v5
 ; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v4, v6
 ; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
 ; GFX11-TRUE16-NEXT:    v_perm_b32 v5, v0, v1, 0xc0c0004
 ; GFX11-TRUE16-NEXT:    v_perm_b32 v2, v2, v3, 0xc0c0004
 ; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v0, 4
 ; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v42, 0
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v2, v2, 16, v5
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
 ; GFX11-TRUE16-NEXT:    global_store_b8 v[0:1], v4, off
@@ -5221,7 +5222,6 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 {
 ; GFX11-TRUE16-NEXT:    scratch_load_b32 v41, off, s33
 ; GFX11-TRUE16-NEXT:    scratch_load_b32 v40, off, s33 offset:4
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v42, 1
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v42, 0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s0, v42, 2
 ; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -5263,8 +5263,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 {
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v2, v3, 0xc0c0004
 ; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 4
 ; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v2, v2, 16, v5
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v42, 2
@@ -5312,8 +5312,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 {
 ; GFX10-SCRATCH-NEXT:    v_perm_b32 v2, v2, v3, 0xc0c0004
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 4
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-SCRATCH-NEXT:    v_lshl_or_b32 v2, v2, 16, v5
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v42, 2
@@ -5377,8 +5377,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 {
 ; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v42, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -5422,8 +5422,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 {
 ; GFX10-NEXT:    v_perm_b32 v5, v6, v7, 0xc0c0004
 ; GFX10-NEXT:    v_perm_b32 v0, v0, v1, 0xc0c0004
 ; GFX10-NEXT:    v_perm_b32 v2, v2, v3, 0xc0c0004
-; GFX10-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-NEXT:    v_lshl_or_b32 v1, v5, 16, v4
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
@@ -5472,18 +5472,18 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 {
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT:    v_perm_b32 v4, v4, v5, 0xc0c0004
 ; GFX11-TRUE16-NEXT:    v_perm_b32 v5, v6, v7, 0xc0c0004
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
 ; GFX11-TRUE16-NEXT:    v_perm_b32 v0, v0, v1, 0xc0c0004
 ; GFX11-TRUE16-NEXT:    v_perm_b32 v2, v2, v3, 0xc0c0004
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v42, 0
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v1, v5, 16, v4
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
 ; GFX11-TRUE16-NEXT:    global_store_b64 v[40:41], v[0:1], off
 ; GFX11-TRUE16-NEXT:    s_clause 0x1 ; 8-byte Folded Reload
 ; GFX11-TRUE16-NEXT:    scratch_load_b32 v41, off, s33
 ; GFX11-TRUE16-NEXT:    scratch_load_b32 v40, off, s33 offset:4
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v42, 1
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v42, 0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s0, v42, 2
 ; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -5528,8 +5528,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 {
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v0, v1, 0xc0c0004
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v2, v3, 0xc0c0004
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v1, v5, 16, v4
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
@@ -5579,8 +5579,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 {
 ; GFX10-SCRATCH-NEXT:    v_perm_b32 v5, v6, v7, 0xc0c0004
 ; GFX10-SCRATCH-NEXT:    v_perm_b32 v0, v0, v1, 0xc0c0004
 ; GFX10-SCRATCH-NEXT:    v_perm_b32 v2, v2, v3, 0xc0c0004
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-SCRATCH-NEXT:    v_lshl_or_b32 v1, v5, 16, v4
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
@@ -5699,8 +5699,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 {
 ; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_readlane_b32 s31, v44, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v44, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v44, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v44, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -5806,8 +5806,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 {
 ; GFX10-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:8
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:12
-; GFX10-NEXT:    v_readlane_b32 s31, v44, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v44, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v44, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v44, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -5910,8 +5910,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 {
 ; GFX11-NEXT:    scratch_load_b32 v42, off, s33 offset:4
 ; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:8
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:12
-; GFX11-NEXT:    v_readlane_b32 s31, v44, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v44, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v44, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v44, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -6017,8 +6017,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v42, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:8
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 offset:12
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v44, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v44, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v44, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v44, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -6053,8 +6053,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -6081,8 +6081,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -6110,8 +6110,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -6138,8 +6138,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -6171,8 +6171,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -6199,8 +6199,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -6228,8 +6228,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -6256,8 +6256,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -6289,8 +6289,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -6317,8 +6317,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -6346,8 +6346,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -6374,8 +6374,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -6408,8 +6408,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -6437,8 +6437,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm() #0 {
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -6466,8 +6466,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -6495,8 +6495,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -6528,8 +6528,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -6557,8 +6557,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm() #0 {
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -6587,8 +6587,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -6616,8 +6616,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -6648,8 +6648,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -6676,8 +6676,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -6705,8 +6705,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -6733,8 +6733,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -6767,8 +6767,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -6796,8 +6796,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm() #0 {
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -6826,8 +6826,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -6855,8 +6855,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -6887,8 +6887,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -6915,8 +6915,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -6944,8 +6944,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -6972,8 +6972,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -7005,8 +7005,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -7033,8 +7033,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -7062,8 +7062,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -7090,8 +7090,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -7124,8 +7124,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -7153,8 +7153,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm() #0 {
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -7182,8 +7182,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -7211,8 +7211,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -7245,8 +7245,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm(i32) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -7275,8 +7275,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm(i32) #0 {
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -7305,8 +7305,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm(i32) #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -7335,8 +7335,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -7370,8 +7370,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32(i32) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -7401,8 +7401,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32(i32) #0 {
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -7431,8 +7431,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32(i32) #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -7462,8 +7462,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -7494,8 +7494,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -7522,8 +7522,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -7551,8 +7551,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -7579,8 +7579,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -7615,8 +7615,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -7646,8 +7646,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm() #0 {
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -7676,8 +7676,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -7707,8 +7707,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -7743,8 +7743,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -7775,8 +7775,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 {
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -7806,8 +7806,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -7838,8 +7838,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -7874,8 +7874,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 {
 ; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v8i32@abs32@lo
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -7907,8 +7907,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 {
 ; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v8i32@abs32@lo
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -7941,8 +7941,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -7974,8 +7974,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v8i32@abs32@lo
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -8015,8 +8015,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -8050,8 +8050,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 {
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -8082,8 +8082,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -8117,8 +8117,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -8155,8 +8155,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 {
 ; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v16i32@abs32@lo
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -8190,8 +8190,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 {
 ; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v16i32@abs32@lo
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -8226,8 +8226,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -8261,8 +8261,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v16i32@abs32@lo
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -8306,8 +8306,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 {
 ; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v32i32@abs32@lo
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -8345,8 +8345,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 {
 ; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v32i32@abs32@lo
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -8385,8 +8385,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -8424,8 +8424,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v32i32@abs32@lo
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -8472,8 +8472,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 {
 ; GFX9-NEXT:    s_waitcnt vmcnt(8)
 ; GFX9-NEXT:    buffer_store_dword v32, off, s[0:3], s32
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -8514,8 +8514,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 {
 ; GFX10-NEXT:    s_waitcnt vmcnt(8)
 ; GFX10-NEXT:    buffer_store_dword v33, off, s[0:3], s32
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -8556,8 +8556,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 {
 ; GFX11-NEXT:    s_waitcnt vmcnt(8)
 ; GFX11-NEXT:    scratch_store_b32 off, v32, s32
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -8598,8 +8598,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(8)
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v33, s32
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -8641,8 +8641,8 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v42, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -8678,8 +8678,8 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou
 ; GFX10-NEXT:    s_clause 0x1 ; 8-byte Folded Reload
 ; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4
-; GFX10-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v42, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -8715,8 +8715,8 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou
 ; GFX11-NEXT:    s_clause 0x1 ; 8-byte Folded Reload
 ; GFX11-NEXT:    scratch_load_b32 v41, off, s33
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:4
-; GFX11-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v42, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -8752,8 +8752,8 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou
 ; GFX10-SCRATCH-NEXT:    s_clause 0x1 ; 8-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 offset:4
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v42, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -8789,8 +8789,8 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 {
 ; GFX9-NEXT:    s_mov_b32 s34, external_void_func_struct_i8_i32@abs32@lo
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -8822,8 +8822,8 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 {
 ; GFX10-NEXT:    s_mov_b32 s34, external_void_func_struct_i8_i32@abs32@lo
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -8856,8 +8856,8 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 {
 ; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -8889,8 +8889,8 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 {
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -8922,8 +8922,8 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_struct_i8_i32@abs32@lo
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -8960,8 +8960,8 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -8992,8 +8992,8 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 {
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -9026,8 +9026,8 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 {
 ; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -9058,8 +9058,8 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 {
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -9090,8 +9090,8 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 {
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -9135,8 +9135,8 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT:    buffer_load_ubyte v0, off, s[0:3], s33 offset:8
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:12
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -9177,8 +9177,8 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
 ; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    buffer_load_ubyte v0, off, s[0:3], s33 offset:8
 ; GFX10-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:12
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
@@ -9219,8 +9219,8 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v0, off, s33 offset:8
 ; GFX11-TRUE16-NEXT:    scratch_load_b32 v1, off, s33 offset:12
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
@@ -9259,8 +9259,8 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
 ; GFX11-FAKE16-NEXT:    s_clause 0x1
 ; GFX11-FAKE16-NEXT:    scratch_load_u8 v0, off, s33 offset:8
 ; GFX11-FAKE16-NEXT:    scratch_load_b32 v1, off, s33 offset:12
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
@@ -9301,8 +9301,8 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
 ; GFX10-SCRATCH-NEXT:    s_clause 0x1
 ; GFX10-SCRATCH-NEXT:    scratch_load_ubyte v0, off, s33 offset:8
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v1, off, s33 offset:12
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
@@ -9373,8 +9373,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v2, v17
 ; GFX9-NEXT:    v_mov_b32_e32 v3, v18
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -9424,8 +9424,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 {
 ; GFX10-NEXT:    v_mov_b32_e32 v2, v17
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v18
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -9472,8 +9472,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 {
 ; GFX11-NEXT:    v_dual_mov_b32 v12, v3 :: v_dual_mov_b32 v1, v16
 ; GFX11-NEXT:    v_dual_mov_b32 v2, v17 :: v_dual_mov_b32 v3, v18
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -9523,8 +9523,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 {
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v2, v17
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v3, v18
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -9552,46 +9552,46 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
 ; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s33 offset:16
 ; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s33 offset:20
 ; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s33
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s35, 3
-; GFX9-NEXT:    v_writelane_b32 v40, s36, 4
-; GFX9-NEXT:    v_writelane_b32 v40, s37, 5
-; GFX9-NEXT:    v_writelane_b32 v40, s38, 6
-; GFX9-NEXT:    v_writelane_b32 v40, s39, 7
-; GFX9-NEXT:    v_writelane_b32 v40, s48, 8
-; GFX9-NEXT:    v_writelane_b32 v40, s49, 9
-; GFX9-NEXT:    v_writelane_b32 v40, s50, 10
-; GFX9-NEXT:    v_writelane_b32 v40, s51, 11
-; GFX9-NEXT:    v_writelane_b32 v40, s52, 12
-; GFX9-NEXT:    v_writelane_b32 v40, s53, 13
+; GFX9-NEXT:    v_writelane_b32 v40, s34, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s35, 1
+; GFX9-NEXT:    v_writelane_b32 v40, s36, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s37, 3
+; GFX9-NEXT:    v_writelane_b32 v40, s38, 4
+; GFX9-NEXT:    v_writelane_b32 v40, s39, 5
+; GFX9-NEXT:    v_writelane_b32 v40, s48, 6
+; GFX9-NEXT:    v_writelane_b32 v40, s49, 7
+; GFX9-NEXT:    v_writelane_b32 v40, s50, 8
+; GFX9-NEXT:    v_writelane_b32 v40, s51, 9
+; GFX9-NEXT:    v_writelane_b32 v40, s52, 10
+; GFX9-NEXT:    v_writelane_b32 v40, s53, 11
+; GFX9-NEXT:    v_writelane_b32 v40, s54, 12
+; GFX9-NEXT:    v_writelane_b32 v40, s55, 13
 ; GFX9-NEXT:    s_addk_i32 s32, 0x800
-; GFX9-NEXT:    v_writelane_b32 v40, s54, 14
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 14
 ; GFX9-NEXT:    s_mov_b32 s5, byval_align16_f64_arg@abs32@hi
 ; GFX9-NEXT:    s_mov_b32 s4, byval_align16_f64_arg@abs32@lo
-; GFX9-NEXT:    v_writelane_b32 v40, s55, 15
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 15
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NEXT:    buffer_store_dword v32, off, s[0:3], s32
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:4
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s55, v40, 15
-; GFX9-NEXT:    v_readlane_b32 s54, v40, 14
-; GFX9-NEXT:    v_readlane_b32 s53, v40, 13
-; GFX9-NEXT:    v_readlane_b32 s52, v40, 12
-; GFX9-NEXT:    v_readlane_b32 s51, v40, 11
-; GFX9-NEXT:    v_readlane_b32 s50, v40, 10
-; GFX9-NEXT:    v_readlane_b32 s49, v40, 9
-; GFX9-NEXT:    v_readlane_b32 s48, v40, 8
-; GFX9-NEXT:    v_readlane_b32 s39, v40, 7
-; GFX9-NEXT:    v_readlane_b32 s38, v40, 6
-; GFX9-NEXT:    v_readlane_b32 s37, v40, 5
-; GFX9-NEXT:    v_readlane_b32 s36, v40, 4
-; GFX9-NEXT:    v_readlane_b32 s35, v40, 3
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 14
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 15
+; GFX9-NEXT:    v_readlane_b32 s55, v40, 13
+; GFX9-NEXT:    v_readlane_b32 s54, v40, 12
+; GFX9-NEXT:    v_readlane_b32 s53, v40, 11
+; GFX9-NEXT:    v_readlane_b32 s52, v40, 10
+; GFX9-NEXT:    v_readlane_b32 s51, v40, 9
+; GFX9-NEXT:    v_readlane_b32 s50, v40, 8
+; GFX9-NEXT:    v_readlane_b32 s49, v40, 7
+; GFX9-NEXT:    v_readlane_b32 s48, v40, 6
+; GFX9-NEXT:    v_readlane_b32 s39, v40, 5
+; GFX9-NEXT:    v_readlane_b32 s38, v40, 4
+; GFX9-NEXT:    v_readlane_b32 s37, v40, 3
+; GFX9-NEXT:    v_readlane_b32 s36, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s35, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s34, v40, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
@@ -9613,7 +9613,7 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
 ; GFX10-NEXT:    buffer_load_dword v32, off, s[0:3], s33 offset:16
 ; GFX10-NEXT:    buffer_load_dword v33, off, s[0:3], s33 offset:20
 ; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s33
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s34, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x400
 ; GFX10-NEXT:    s_mov_b32 s5, byval_align16_f64_arg@abs32@hi
 ; GFX10-NEXT:    s_mov_b32 s4, byval_align16_f64_arg@abs32@lo
@@ -9621,38 +9621,38 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
 ; GFX10-NEXT:    buffer_store_dword v32, off, s[0:3], s32
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:4
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
-; GFX10-NEXT:    v_writelane_b32 v40, s35, 3
-; GFX10-NEXT:    v_writelane_b32 v40, s36, 4
-; GFX10-NEXT:    v_writelane_b32 v40, s37, 5
-; GFX10-NEXT:    v_writelane_b32 v40, s38, 6
-; GFX10-NEXT:    v_writelane_b32 v40, s39, 7
-; GFX10-NEXT:    v_writelane_b32 v40, s48, 8
-; GFX10-NEXT:    v_writelane_b32 v40, s49, 9
-; GFX10-NEXT:    v_writelane_b32 v40, s50, 10
-; GFX10-NEXT:    v_writelane_b32 v40, s51, 11
-; GFX10-NEXT:    v_writelane_b32 v40, s52, 12
-; GFX10-NEXT:    v_writelane_b32 v40, s53, 13
-; GFX10-NEXT:    v_writelane_b32 v40, s54, 14
-; GFX10-NEXT:    v_writelane_b32 v40, s55, 15
+; GFX10-NEXT:    v_writelane_b32 v40, s35, 1
+; GFX10-NEXT:    v_writelane_b32 v40, s36, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s37, 3
+; GFX10-NEXT:    v_writelane_b32 v40, s38, 4
+; GFX10-NEXT:    v_writelane_b32 v40, s39, 5
+; GFX10-NEXT:    v_writelane_b32 v40, s48, 6
+; GFX10-NEXT:    v_writelane_b32 v40, s49, 7
+; GFX10-NEXT:    v_writelane_b32 v40, s50, 8
+; GFX10-NEXT:    v_writelane_b32 v40, s51, 9
+; GFX10-NEXT:    v_writelane_b32 v40, s52, 10
+; GFX10-NEXT:    v_writelane_b32 v40, s53, 11
+; GFX10-NEXT:    v_writelane_b32 v40, s54, 12
+; GFX10-NEXT:    v_writelane_b32 v40, s55, 13
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 14
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 15
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_readlane_b32 s55, v40, 15
-; GFX10-NEXT:    v_readlane_b32 s54, v40, 14
-; GFX10-NEXT:    v_readlane_b32 s53, v40, 13
-; GFX10-NEXT:    v_readlane_b32 s52, v40, 12
-; GFX10-NEXT:    v_readlane_b32 s51, v40, 11
-; GFX10-NEXT:    v_readlane_b32 s50, v40, 10
-; GFX10-NEXT:    v_readlane_b32 s49, v40, 9
-; GFX10-NEXT:    v_readlane_b32 s48, v40, 8
-; GFX10-NEXT:    v_readlane_b32 s39, v40, 7
-; GFX10-NEXT:    v_readlane_b32 s38, v40, 6
-; GFX10-NEXT:    v_readlane_b32 s37, v40, 5
-; GFX10-NEXT:    v_readlane_b32 s36, v40, 4
-; GFX10-NEXT:    v_readlane_b32 s35, v40, 3
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 14
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 15
+; GFX10-NEXT:    v_readlane_b32 s55, v40, 13
+; GFX10-NEXT:    v_readlane_b32 s54, v40, 12
+; GFX10-NEXT:    v_readlane_b32 s53, v40, 11
+; GFX10-NEXT:    v_readlane_b32 s52, v40, 10
+; GFX10-NEXT:    v_readlane_b32 s51, v40, 9
+; GFX10-NEXT:    v_readlane_b32 s50, v40, 8
+; GFX10-NEXT:    v_readlane_b32 s49, v40, 7
+; GFX10-NEXT:    v_readlane_b32 s48, v40, 6
+; GFX10-NEXT:    v_readlane_b32 s39, v40, 5
+; GFX10-NEXT:    v_readlane_b32 s38, v40, 4
+; GFX10-NEXT:    v_readlane_b32 s37, v40, 3
+; GFX10-NEXT:    v_readlane_b32 s36, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s35, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s34, v40, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
@@ -9673,44 +9673,44 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    scratch_load_b64 v[32:33], off, s33 offset:16
 ; GFX11-NEXT:    scratch_load_b32 v31, off, s33
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v40, s34, 0
 ; GFX11-NEXT:    s_add_i32 s32, s32, 32
 ; GFX11-NEXT:    s_mov_b32 s1, byval_align16_f64_arg@abs32@hi
 ; GFX11-NEXT:    s_mov_b32 s0, byval_align16_f64_arg@abs32@lo
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX11-NEXT:    v_writelane_b32 v40, s34, 2
-; GFX11-NEXT:    v_writelane_b32 v40, s35, 3
-; GFX11-NEXT:    v_writelane_b32 v40, s36, 4
-; GFX11-NEXT:    v_writelane_b32 v40, s37, 5
-; GFX11-NEXT:    v_writelane_b32 v40, s38, 6
-; GFX11-NEXT:    v_writelane_b32 v40, s39, 7
-; GFX11-NEXT:    v_writelane_b32 v40, s48, 8
-; GFX11-NEXT:    v_writelane_b32 v40, s49, 9
-; GFX11-NEXT:    v_writelane_b32 v40, s50, 10
-; GFX11-NEXT:    v_writelane_b32 v40, s51, 11
-; GFX11-NEXT:    v_writelane_b32 v40, s52, 12
-; GFX11-NEXT:    v_writelane_b32 v40, s53, 13
-; GFX11-NEXT:    v_writelane_b32 v40, s54, 14
-; GFX11-NEXT:    v_writelane_b32 v40, s55, 15
+; GFX11-NEXT:    v_writelane_b32 v40, s35, 1
+; GFX11-NEXT:    v_writelane_b32 v40, s36, 2
+; GFX11-NEXT:    v_writelane_b32 v40, s37, 3
+; GFX11-NEXT:    v_writelane_b32 v40, s38, 4
+; GFX11-NEXT:    v_writelane_b32 v40, s39, 5
+; GFX11-NEXT:    v_writelane_b32 v40, s48, 6
+; GFX11-NEXT:    v_writelane_b32 v40, s49, 7
+; GFX11-NEXT:    v_writelane_b32 v40, s50, 8
+; GFX11-NEXT:    v_writelane_b32 v40, s51, 9
+; GFX11-NEXT:    v_writelane_b32 v40, s52, 10
+; GFX11-NEXT:    v_writelane_b32 v40, s53, 11
+; GFX11-NEXT:    v_writelane_b32 v40, s54, 12
+; GFX11-NEXT:    v_writelane_b32 v40, s55, 13
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 14
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 15
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
 ; GFX11-NEXT:    scratch_store_b64 off, v[32:33], s32
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    v_readlane_b32 s55, v40, 15
-; GFX11-NEXT:    v_readlane_b32 s54, v40, 14
-; GFX11-NEXT:    v_readlane_b32 s53, v40, 13
-; GFX11-NEXT:    v_readlane_b32 s52, v40, 12
-; GFX11-NEXT:    v_readlane_b32 s51, v40, 11
-; GFX11-NEXT:    v_readlane_b32 s50, v40, 10
-; GFX11-NEXT:    v_readlane_b32 s49, v40, 9
-; GFX11-NEXT:    v_readlane_b32 s48, v40, 8
-; GFX11-NEXT:    v_readlane_b32 s39, v40, 7
-; GFX11-NEXT:    v_readlane_b32 s38, v40, 6
-; GFX11-NEXT:    v_readlane_b32 s37, v40, 5
-; GFX11-NEXT:    v_readlane_b32 s36, v40, 4
-; GFX11-NEXT:    v_readlane_b32 s35, v40, 3
-; GFX11-NEXT:    v_readlane_b32 s34, v40, 2
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 14
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 15
+; GFX11-NEXT:    v_readlane_b32 s55, v40, 13
+; GFX11-NEXT:    v_readlane_b32 s54, v40, 12
+; GFX11-NEXT:    v_readlane_b32 s53, v40, 11
+; GFX11-NEXT:    v_readlane_b32 s52, v40, 10
+; GFX11-NEXT:    v_readlane_b32 s51, v40, 9
+; GFX11-NEXT:    v_readlane_b32 s50, v40, 8
+; GFX11-NEXT:    v_readlane_b32 s49, v40, 7
+; GFX11-NEXT:    v_readlane_b32 s48, v40, 6
+; GFX11-NEXT:    v_readlane_b32 s39, v40, 5
+; GFX11-NEXT:    v_readlane_b32 s38, v40, 4
+; GFX11-NEXT:    v_readlane_b32 s37, v40, 3
+; GFX11-NEXT:    v_readlane_b32 s36, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s35, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s34, v40, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:24 ; 4-byte Folded Reload
@@ -9731,44 +9731,44 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
 ; GFX10-SCRATCH-NEXT:    s_clause 0x1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dwordx2 v[32:33], off, s33 offset:16
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v31, off, s33
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s34, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 32
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, byval_align16_f64_arg@abs32@hi
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, byval_align16_f64_arg@abs32@lo
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s34, 2
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s35, 3
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s36, 4
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s37, 5
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s38, 6
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s39, 7
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s48, 8
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s49, 9
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s50, 10
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s51, 11
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s52, 12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s53, 13
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s54, 14
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s55, 15
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s35, 1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s36, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s37, 3
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s38, 4
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s39, 5
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s48, 6
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s49, 7
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s50, 8
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s51, 9
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s52, 10
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s53, 11
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s54, 12
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s55, 13
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 14
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 15
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-SCRATCH-NEXT:    scratch_store_dwordx2 off, v[32:33], s32
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s55, v40, 15
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s54, v40, 14
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s53, v40, 13
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s52, v40, 12
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s51, v40, 11
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s50, v40, 10
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s49, v40, 9
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s48, v40, 8
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s39, v40, 7
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s38, v40, 6
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s37, v40, 5
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s36, v40, 4
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s35, v40, 3
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s34, v40, 2
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 14
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 15
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s55, v40, 13
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s54, v40, 12
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s53, v40, 11
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s52, v40, 10
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s51, v40, 9
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s50, v40, 8
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s49, v40, 7
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s48, v40, 6
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s39, v40, 5
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s38, v40, 4
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s37, v40, 3
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s36, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s35, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s34, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 offset:24 ; 4-byte Folded Reload
@@ -9802,8 +9802,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm_inreg() #0 {
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    buffer_store_byte v0, off, s[0:3], s32
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -9831,8 +9831,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm_inreg() #0 {
 ; GFX10-NEXT:    buffer_store_byte v0, off, s[0:3], s32
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -9861,8 +9861,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -9890,8 +9890,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_byte off, v0, s32
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -9923,8 +9923,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 3
@@ -9953,8 +9953,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 3
@@ -9984,8 +9984,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 3
@@ -10014,8 +10014,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 3
@@ -10048,8 +10048,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 3
@@ -10078,8 +10078,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 3
@@ -10109,8 +10109,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 3
@@ -10139,8 +10139,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 3
@@ -10173,8 +10173,8 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 3
@@ -10203,8 +10203,8 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 3
@@ -10234,8 +10234,8 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 3
@@ -10264,8 +10264,8 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 3
@@ -10300,8 +10300,8 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
@@ -10333,8 +10333,8 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
@@ -10367,8 +10367,8 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
@@ -10400,8 +10400,8 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
@@ -10439,8 +10439,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -10476,8 +10476,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 4
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
@@ -10514,8 +10514,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX11-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
@@ -10551,8 +10551,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 4
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
@@ -10595,8 +10595,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -10634,8 +10634,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 4
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
@@ -10674,8 +10674,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX11-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
@@ -10713,8 +10713,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 4
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
@@ -10758,8 +10758,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 7
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 7
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 6
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 7
 ; GFX9-NEXT:    v_readlane_b32 s9, v40, 5
 ; GFX9-NEXT:    v_readlane_b32 s8, v40, 4
 ; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
@@ -10801,8 +10801,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 6
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 7
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 7
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 6
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 7
 ; GFX10-NEXT:    v_readlane_b32 s9, v40, 5
 ; GFX10-NEXT:    v_readlane_b32 s8, v40, 4
 ; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
@@ -10845,8 +10845,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 7
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 7
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 6
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 7
 ; GFX11-NEXT:    v_readlane_b32 s9, v40, 5
 ; GFX11-NEXT:    v_readlane_b32 s8, v40, 4
 ; GFX11-NEXT:    v_readlane_b32 s7, v40, 3
@@ -10888,8 +10888,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 6
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 7
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 7
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 6
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 7
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s9, v40, 5
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s8, v40, 4
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
@@ -10942,8 +10942,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 9
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 8
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX9-NEXT:    v_readlane_b32 s11, v40, 7
 ; GFX9-NEXT:    v_readlane_b32 s10, v40, 6
 ; GFX9-NEXT:    v_readlane_b32 s9, v40, 5
@@ -10991,8 +10991,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 8
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 9
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 8
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX10-NEXT:    v_readlane_b32 s11, v40, 7
 ; GFX10-NEXT:    v_readlane_b32 s10, v40, 6
 ; GFX10-NEXT:    v_readlane_b32 s9, v40, 5
@@ -11041,8 +11041,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 9
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 8
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX11-NEXT:    v_readlane_b32 s11, v40, 7
 ; GFX11-NEXT:    v_readlane_b32 s10, v40, 6
 ; GFX11-NEXT:    v_readlane_b32 s9, v40, 5
@@ -11090,8 +11090,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 8
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 9
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 8
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s11, v40, 7
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s10, v40, 6
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s9, v40, 5
@@ -11133,8 +11133,8 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 3
@@ -11163,8 +11163,8 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 3
@@ -11194,8 +11194,8 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 3
@@ -11224,8 +11224,8 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 3
@@ -11258,8 +11258,8 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 3
@@ -11288,8 +11288,8 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 3
@@ -11319,8 +11319,8 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 3
@@ -11349,8 +11349,8 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 3
@@ -11385,8 +11385,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
@@ -11418,8 +11418,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
@@ -11452,8 +11452,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
@@ -11485,8 +11485,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
@@ -11524,8 +11524,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 4
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 4
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 3
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 4
 ; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
@@ -11560,8 +11560,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 3
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 4
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 4
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 3
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 4
 ; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
@@ -11597,8 +11597,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 4
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 4
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 3
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 4
 ; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
@@ -11633,8 +11633,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 3
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 4
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 4
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 3
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 4
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
@@ -11677,8 +11677,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 6
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 6
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 5
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 6
 ; GFX9-NEXT:    v_readlane_b32 s8, v40, 4
 ; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
@@ -11719,8 +11719,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 5
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 6
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 6
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 5
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 6
 ; GFX10-NEXT:    v_readlane_b32 s8, v40, 4
 ; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
@@ -11762,8 +11762,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 6
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 6
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 5
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 6
 ; GFX11-NEXT:    v_readlane_b32 s8, v40, 4
 ; GFX11-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
@@ -11804,8 +11804,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 5
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 6
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 6
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 5
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 6
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s8, v40, 4
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
@@ -11844,8 +11844,8 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
@@ -11877,8 +11877,8 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
@@ -11911,8 +11911,8 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
@@ -11944,8 +11944,8 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
@@ -11985,8 +11985,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -12024,8 +12024,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 4
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
@@ -12064,8 +12064,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX11-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
@@ -12103,8 +12103,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 4
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
@@ -12150,8 +12150,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 7
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 7
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 6
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 7
 ; GFX9-NEXT:    v_readlane_b32 s9, v40, 5
 ; GFX9-NEXT:    v_readlane_b32 s8, v40, 4
 ; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
@@ -12195,8 +12195,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 6
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 7
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 7
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 6
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 7
 ; GFX10-NEXT:    v_readlane_b32 s9, v40, 5
 ; GFX10-NEXT:    v_readlane_b32 s8, v40, 4
 ; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
@@ -12241,8 +12241,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 7
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 7
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 6
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 7
 ; GFX11-NEXT:    v_readlane_b32 s9, v40, 5
 ; GFX11-NEXT:    v_readlane_b32 s8, v40, 4
 ; GFX11-NEXT:    v_readlane_b32 s7, v40, 3
@@ -12286,8 +12286,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 6
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 7
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 7
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 6
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 7
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s9, v40, 5
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s8, v40, 4
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
@@ -12325,8 +12325,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 3
@@ -12355,8 +12355,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 3
@@ -12386,8 +12386,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 3
@@ -12416,8 +12416,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 3
@@ -12452,8 +12452,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
@@ -12484,8 +12484,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
@@ -12517,8 +12517,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
@@ -12549,8 +12549,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
@@ -12586,8 +12586,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
@@ -12618,8 +12618,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
@@ -12651,8 +12651,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
@@ -12683,8 +12683,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
@@ -12721,8 +12721,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
@@ -12754,8 +12754,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
@@ -12788,8 +12788,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
@@ -12821,8 +12821,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
@@ -12858,8 +12858,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
@@ -12891,8 +12891,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
@@ -12925,8 +12925,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
@@ -12958,8 +12958,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
@@ -12994,8 +12994,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
@@ -13026,8 +13026,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
@@ -13059,8 +13059,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
@@ -13091,8 +13091,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
@@ -13129,8 +13129,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
@@ -13162,8 +13162,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
@@ -13196,8 +13196,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
@@ -13229,8 +13229,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
@@ -13264,8 +13264,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 3
@@ -13294,8 +13294,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 3
@@ -13325,8 +13325,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 3
@@ -13355,8 +13355,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 3
@@ -13391,8 +13391,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
@@ -13423,8 +13423,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
@@ -13456,8 +13456,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
@@ -13488,8 +13488,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
@@ -13526,8 +13526,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
@@ -13559,8 +13559,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
@@ -13593,8 +13593,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
@@ -13626,8 +13626,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
@@ -13665,8 +13665,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 4
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 4
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 3
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 4
 ; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
@@ -13701,8 +13701,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 3
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 4
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 4
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 3
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 4
 ; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
@@ -13738,8 +13738,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 4
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 4
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 3
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 4
 ; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
@@ -13774,8 +13774,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 3
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 4
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 4
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 3
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 4
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
@@ -13816,8 +13816,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -13855,8 +13855,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 4
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
@@ -13895,8 +13895,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX11-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
@@ -13934,8 +13934,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 4
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
@@ -13974,8 +13974,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -14010,8 +14010,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 4
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
@@ -14047,8 +14047,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX11-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
@@ -14083,8 +14083,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 4
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
@@ -14127,8 +14127,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -14166,8 +14166,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 4
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
@@ -14206,8 +14206,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX11-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
@@ -14245,8 +14245,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 4
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
@@ -14290,8 +14290,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 6
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 6
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 5
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 6
 ; GFX9-NEXT:    v_readlane_b32 s8, v40, 4
 ; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
@@ -14332,8 +14332,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 5
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 6
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 6
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 5
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 6
 ; GFX10-NEXT:    v_readlane_b32 s8, v40, 4
 ; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
@@ -14375,8 +14375,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 6
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 6
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 5
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 6
 ; GFX11-NEXT:    v_readlane_b32 s8, v40, 4
 ; GFX11-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
@@ -14417,8 +14417,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 5
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 6
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 6
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 5
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 6
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s8, v40, 4
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
@@ -14464,8 +14464,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 9
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 8
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX9-NEXT:    v_readlane_b32 s11, v40, 7
 ; GFX9-NEXT:    v_readlane_b32 s10, v40, 6
 ; GFX9-NEXT:    v_readlane_b32 s9, v40, 5
@@ -14510,8 +14510,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 8
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 9
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 8
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX10-NEXT:    v_readlane_b32 s11, v40, 7
 ; GFX10-NEXT:    v_readlane_b32 s10, v40, 6
 ; GFX10-NEXT:    v_readlane_b32 s9, v40, 5
@@ -14557,8 +14557,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 9
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 8
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX11-NEXT:    v_readlane_b32 s11, v40, 7
 ; GFX11-NEXT:    v_readlane_b32 s10, v40, 6
 ; GFX11-NEXT:    v_readlane_b32 s9, v40, 5
@@ -14603,8 +14603,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 8
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 9
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 8
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s11, v40, 7
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s10, v40, 6
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s9, v40, 5
@@ -14660,8 +14660,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 9
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 8
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX9-NEXT:    v_readlane_b32 s11, v40, 7
 ; GFX9-NEXT:    v_readlane_b32 s10, v40, 6
 ; GFX9-NEXT:    v_readlane_b32 s9, v40, 5
@@ -14711,8 +14711,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 8
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 9
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 8
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX10-NEXT:    v_readlane_b32 s11, v40, 7
 ; GFX10-NEXT:    v_readlane_b32 s10, v40, 6
 ; GFX10-NEXT:    v_readlane_b32 s9, v40, 5
@@ -14763,8 +14763,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 9
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 8
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX11-NEXT:    v_readlane_b32 s11, v40, 7
 ; GFX11-NEXT:    v_readlane_b32 s10, v40, 6
 ; GFX11-NEXT:    v_readlane_b32 s9, v40, 5
@@ -14814,8 +14814,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 8
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 9
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 8
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s11, v40, 7
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s10, v40, 6
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s9, v40, 5
@@ -14872,8 +14872,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 17
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 17
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 16
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 17
 ; GFX9-NEXT:    v_readlane_b32 s19, v40, 15
 ; GFX9-NEXT:    v_readlane_b32 s18, v40, 14
 ; GFX9-NEXT:    v_readlane_b32 s17, v40, 13
@@ -14934,8 +14934,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 16
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 17
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 17
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 16
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 17
 ; GFX10-NEXT:    v_readlane_b32 s19, v40, 15
 ; GFX10-NEXT:    v_readlane_b32 s18, v40, 14
 ; GFX10-NEXT:    v_readlane_b32 s17, v40, 13
@@ -14997,8 +14997,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 17
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 17
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 16
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 17
 ; GFX11-NEXT:    v_readlane_b32 s19, v40, 15
 ; GFX11-NEXT:    v_readlane_b32 s18, v40, 14
 ; GFX11-NEXT:    v_readlane_b32 s17, v40, 13
@@ -15059,8 +15059,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 17
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 17
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 16
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 17
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s19, v40, 15
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s18, v40, 14
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s17, v40, 13
@@ -15161,8 +15161,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 27
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:20
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 27
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 26
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 27
 ; GFX9-NEXT:    v_readlane_b32 s29, v40, 25
 ; GFX9-NEXT:    v_readlane_b32 s28, v40, 24
 ; GFX9-NEXT:    v_readlane_b32 s27, v40, 23
@@ -15268,8 +15268,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 26
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 27
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 27
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 26
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 27
 ; GFX10-NEXT:    v_readlane_b32 s29, v40, 25
 ; GFX10-NEXT:    v_readlane_b32 s28, v40, 24
 ; GFX10-NEXT:    v_readlane_b32 s27, v40, 23
@@ -15371,8 +15371,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 27
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 27
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 26
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 27
 ; GFX11-NEXT:    v_readlane_b32 s29, v40, 25
 ; GFX11-NEXT:    v_readlane_b32 s28, v40, 24
 ; GFX11-NEXT:    v_readlane_b32 s27, v40, 23
@@ -15475,8 +15475,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 26
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 27
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 27
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 26
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 27
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s29, v40, 25
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s28, v40, 24
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s27, v40, 23
@@ -15592,8 +15592,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 27
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:20
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 27
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 26
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 27
 ; GFX9-NEXT:    v_readlane_b32 s29, v40, 25
 ; GFX9-NEXT:    v_readlane_b32 s28, v40, 24
 ; GFX9-NEXT:    v_readlane_b32 s27, v40, 23
@@ -15704,8 +15704,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 26
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 27
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 27
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 26
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 27
 ; GFX10-NEXT:    v_readlane_b32 s29, v40, 25
 ; GFX10-NEXT:    v_readlane_b32 s28, v40, 24
 ; GFX10-NEXT:    v_readlane_b32 s27, v40, 23
@@ -15811,8 +15811,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 27
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 27
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 26
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 27
 ; GFX11-NEXT:    v_readlane_b32 s29, v40, 25
 ; GFX11-NEXT:    v_readlane_b32 s28, v40, 24
 ; GFX11-NEXT:    v_readlane_b32 s27, v40, 23
@@ -15921,8 +15921,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 26
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 27
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 27
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 26
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 27
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s29, v40, 25
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s28, v40, 24
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s27, v40, 23
@@ -15987,8 +15987,8 @@ define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, d
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:4
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -16021,8 +16021,8 @@ define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, d
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -16051,8 +16051,8 @@ define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, d
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    scratch_store_b64 off, v[32:33], s32
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -16081,8 +16081,8 @@ define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, d
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT:    scratch_store_dwordx2 off, v[32:33], s32
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -16153,8 +16153,8 @@ define amdgpu_gfx void @stack_12xv3i32() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v31, 11
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -16220,8 +16220,8 @@ define amdgpu_gfx void @stack_12xv3i32() #0 {
 ; GFX10-NEXT:    s_mov_b32 s34, external_void_func_12xv3i32@abs32@lo
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -16267,8 +16267,8 @@ define amdgpu_gfx void @stack_12xv3i32() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -16331,8 +16331,8 @@ define amdgpu_gfx void @stack_12xv3i32() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_12xv3i32@abs32@lo
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -16423,8 +16423,8 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v31, 7
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -16498,8 +16498,8 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
 ; GFX10-NEXT:    s_mov_b32 s34, external_void_func_8xv5i32@abs32@lo
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -16549,8 +16549,8 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -16619,8 +16619,8 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_8xv5i32@abs32@lo
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -16707,8 +16707,8 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v31, 0x40e00000
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -16782,8 +16782,8 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
 ; GFX10-NEXT:    s_mov_b32 s34, external_void_func_8xv5f32@abs32@lo
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -16838,8 +16838,8 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -16908,8 +16908,8 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_8xv5f32@abs32@lo
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -16948,8 +16948,8 @@ define amdgpu_gfx void @test_call_external_void_func_bf16(i16 %arg) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -16975,8 +16975,8 @@ define amdgpu_gfx void @test_call_external_void_func_bf16(i16 %arg) #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -17003,8 +17003,8 @@ define amdgpu_gfx void @test_call_external_void_func_bf16(i16 %arg) #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -17030,8 +17030,8 @@ define amdgpu_gfx void @test_call_external_void_func_bf16(i16 %arg) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -17062,8 +17062,8 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16(i16 %arg) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -17089,8 +17089,8 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16(i16 %arg) #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -17117,8 +17117,8 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16(i16 %arg) #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -17144,8 +17144,8 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16(i16 %arg) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -17176,8 +17176,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16(i32 %arg) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -17203,8 +17203,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16(i32 %arg) #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -17231,8 +17231,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16(i32 %arg) #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -17258,8 +17258,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16(i32 %arg) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -17290,8 +17290,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16(<3 x i16> %arg) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -17317,8 +17317,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16(<3 x i16> %arg) #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -17345,8 +17345,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16(<3 x i16> %arg) #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -17372,8 +17372,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16(<3 x i16> %arg) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -17404,8 +17404,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16(<4 x i16> %arg) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -17431,8 +17431,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16(<4 x i16> %arg) #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -17459,8 +17459,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16(<4 x i16> %arg) #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -17486,8 +17486,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16(<4 x i16> %arg) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -17518,8 +17518,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16(<8 x i16> %arg) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -17545,8 +17545,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16(<8 x i16> %arg) #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -17573,8 +17573,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16(<8 x i16> %arg) #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -17600,8 +17600,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16(<8 x i16> %arg) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -17632,8 +17632,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16(<16 x i16> %arg) #0
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -17659,8 +17659,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16(<16 x i16> %arg) #0
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -17687,8 +17687,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16(<16 x i16> %arg) #0
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -17714,8 +17714,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16(<16 x i16> %arg) #0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -17746,8 +17746,8 @@ define amdgpu_gfx void @test_call_external_void_func_bf16_inreg(i16 inreg %arg)
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -17773,8 +17773,8 @@ define amdgpu_gfx void @test_call_external_void_func_bf16_inreg(i16 inreg %arg)
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -17801,8 +17801,8 @@ define amdgpu_gfx void @test_call_external_void_func_bf16_inreg(i16 inreg %arg)
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -17828,8 +17828,8 @@ define amdgpu_gfx void @test_call_external_void_func_bf16_inreg(i16 inreg %arg)
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -17860,8 +17860,8 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16_inreg(i16 inreg %arg
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -17887,8 +17887,8 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16_inreg(i16 inreg %arg
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -17915,8 +17915,8 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16_inreg(i16 inreg %arg
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -17942,8 +17942,8 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16_inreg(i16 inreg %arg
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -17974,8 +17974,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16_inreg(i32 inreg %arg
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -18001,8 +18001,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16_inreg(i32 inreg %arg
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -18029,8 +18029,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16_inreg(i32 inreg %arg
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -18056,8 +18056,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16_inreg(i32 inreg %arg
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -18088,8 +18088,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16_inreg(<3 x i16> inre
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -18115,8 +18115,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16_inreg(<3 x i16> inre
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -18143,8 +18143,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16_inreg(<3 x i16> inre
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -18170,8 +18170,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16_inreg(<3 x i16> inre
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -18202,8 +18202,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16_inreg(<4 x i16> inre
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -18229,8 +18229,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16_inreg(<4 x i16> inre
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -18257,8 +18257,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16_inreg(<4 x i16> inre
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -18284,8 +18284,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16_inreg(<4 x i16> inre
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -18316,8 +18316,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16_inreg(<8 x i16> inre
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -18343,8 +18343,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16_inreg(<8 x i16> inre
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -18371,8 +18371,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16_inreg(<8 x i16> inre
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -18398,8 +18398,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16_inreg(<8 x i16> inre
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -18430,8 +18430,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16_inreg(<16 x i16> in
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -18457,8 +18457,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16_inreg(<16 x i16> in
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -18485,8 +18485,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16_inreg(<16 x i16> in
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -18512,8 +18512,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16_inreg(<16 x i16> in
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll
index 0005e8a2619b2..260398a519660 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll
@@ -26,8 +26,8 @@ define amdgpu_gfx void @test_call_external_void_func_void_clobber_s30_s31_call_e
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
@@ -60,8 +60,8 @@ define amdgpu_gfx void @test_call_external_void_func_void_clobber_s30_s31_call_e
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
@@ -95,8 +95,8 @@ define amdgpu_gfx void @test_call_external_void_func_void_clobber_s30_s31_call_e
 ; GFX11-NEXT:    ;;#ASMEND
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
@@ -130,8 +130,8 @@ define amdgpu_gfx void @void_func_void_clobber_s28_s29() #1 {
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; clobber
 ; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    v_readlane_b32 s31, v0, 3
 ; GFX9-NEXT:    v_readlane_b32 s30, v0, 2
+; GFX9-NEXT:    v_readlane_b32 s31, v0, 3
 ; GFX9-NEXT:    v_readlane_b32 s29, v0, 1
 ; GFX9-NEXT:    v_readlane_b32 s28, v0, 0
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[34:35], -1
@@ -157,8 +157,8 @@ define amdgpu_gfx void @void_func_void_clobber_s28_s29() #1 {
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ; clobber
 ; GFX10-NEXT:    ;;#ASMEND
-; GFX10-NEXT:    v_readlane_b32 s31, v0, 3
 ; GFX10-NEXT:    v_readlane_b32 s30, v0, 2
+; GFX10-NEXT:    v_readlane_b32 s31, v0, 3
 ; GFX10-NEXT:    v_readlane_b32 s29, v0, 1
 ; GFX10-NEXT:    v_readlane_b32 s28, v0, 0
 ; GFX10-NEXT:    s_xor_saveexec_b32 s34, -1
@@ -185,8 +185,8 @@ define amdgpu_gfx void @void_func_void_clobber_s28_s29() #1 {
 ; GFX11-NEXT:    ; clobber
 ; GFX11-NEXT:    ;;#ASMEND
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v0, 3
 ; GFX11-NEXT:    v_readlane_b32 s30, v0, 2
+; GFX11-NEXT:    v_readlane_b32 s31, v0, 3
 ; GFX11-NEXT:    v_readlane_b32 s29, v0, 1
 ; GFX11-NEXT:    v_readlane_b32 s28, v0, 0
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
@@ -224,8 +224,8 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_s31(ptr addrspace(1)
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; use s31
 ; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 3
@@ -261,8 +261,8 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_s31(ptr addrspace(1)
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ; use s31
 ; GFX10-NEXT:    ;;#ASMEND
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 3
@@ -298,8 +298,8 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_s31(ptr addrspace(1)
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ; use s31
 ; GFX11-NEXT:    ;;#ASMEND
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 3
@@ -341,8 +341,8 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(ptr addrspace(1)
 ; GFX9-NEXT:    ; use v31
 ; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_readlane_b32 s31, v41, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v41, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v41, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v41, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -378,8 +378,8 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(ptr addrspace(1)
 ; GFX10-NEXT:    ; use v31
 ; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX10-NEXT:    v_readlane_b32 s31, v41, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v41, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v41, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v41, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -416,8 +416,8 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(ptr addrspace(1)
 ; GFX11-NEXT:    ; use v31
 ; GFX11-NEXT:    ;;#ASMEND
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
-; GFX11-NEXT:    v_readlane_b32 s31, v41, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v41, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v41, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v41, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -455,11 +455,11 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s33(ptr addrspace(1)
 ; GFX9-NEXT:    s_mov_b32 s4, s33
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT:    s_mov_b32 s33, s4
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; use s33
 ; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
-; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 3
@@ -492,11 +492,11 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s33(ptr addrspace(1)
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    s_mov_b32 s33, s4
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ; use s33
 ; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
-; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 3
@@ -529,12 +529,12 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s33(ptr addrspace(1)
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_mov_b32 s33, s4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ; use s33
 ; GFX11-NEXT:    ;;#ASMEND
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 3
@@ -572,11 +572,11 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s34(ptr addrspace(1)
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT:    s_mov_b32 s34, s4
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; use s34
 ; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
-; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 3
@@ -609,11 +609,11 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s34(ptr addrspace(1)
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    s_mov_b32 s34, s4
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ; use s34
 ; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
-; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 3
@@ -645,13 +645,13 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s34(ptr addrspace(1)
 ; GFX11-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s34, s4
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ; use s34
 ; GFX11-NEXT:    ;;#ASMEND
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 3
@@ -691,8 +691,8 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_v40(ptr addrspace(1)
 ; GFX9-NEXT:    ; use v40
 ; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_readlane_b32 s31, v41, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v41, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v41, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v41, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -726,8 +726,8 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_v40(ptr addrspace(1)
 ; GFX10-NEXT:    ; use v40
 ; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX10-NEXT:    v_readlane_b32 s31, v41, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v41, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v41, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v41, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -761,8 +761,8 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_v40(ptr addrspace(1)
 ; GFX11-NEXT:    ; use v40
 ; GFX11-NEXT:    ;;#ASMEND
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
-; GFX11-NEXT:    v_readlane_b32 s31, v41, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v41, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v41, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v41, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -849,8 +849,8 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s33() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -876,8 +876,8 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s33() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -904,8 +904,8 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s33() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -934,8 +934,8 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s34() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -961,8 +961,8 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s34() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -989,8 +989,8 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s34() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -1024,11 +1024,11 @@ define amdgpu_gfx void @callee_saved_sgpr_kernel() #1 {
 ; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    s_mov_b32 s4, s40
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; use s4
 ; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
-; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 3
@@ -1060,11 +1060,11 @@ define amdgpu_gfx void @callee_saved_sgpr_kernel() #1 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ; use s4
 ; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
-; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 3
@@ -1096,12 +1096,12 @@ define amdgpu_gfx void @callee_saved_sgpr_kernel() #1 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ; use s4
 ; GFX11-NEXT:    ;;#ASMEND
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 3
@@ -1150,8 +1150,8 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 {
 ; GFX9-NEXT:    ; use v40
 ; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_readlane_b32 s31, v41, 2
 ; GFX9-NEXT:    v_readlane_b32 s30, v41, 1
+; GFX9-NEXT:    v_readlane_b32 s31, v41, 2
 ; GFX9-NEXT:    v_readlane_b32 s4, v41, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v41, 3
@@ -1195,8 +1195,8 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 {
 ; GFX10-NEXT:    ; use v40
 ; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX10-NEXT:    v_readlane_b32 s31, v41, 2
 ; GFX10-NEXT:    v_readlane_b32 s30, v41, 1
+; GFX10-NEXT:    v_readlane_b32 s31, v41, 2
 ; GFX10-NEXT:    v_readlane_b32 s4, v41, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v41, 3
@@ -1240,8 +1240,8 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 {
 ; GFX11-NEXT:    ; use v40
 ; GFX11-NEXT:    ;;#ASMEND
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
-; GFX11-NEXT:    v_readlane_b32 s31, v41, 2
 ; GFX11-NEXT:    v_readlane_b32 s30, v41, 1
+; GFX11-NEXT:    v_readlane_b32 s31, v41, 2
 ; GFX11-NEXT:    v_readlane_b32 s4, v41, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v41, 3
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
index c06011c259f9b..0b54bbd7e2105 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
@@ -34,8 +34,8 @@ define amdgpu_gfx void @call_i1() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v1, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v1, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v1, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v1, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -59,8 +59,8 @@ define amdgpu_gfx void @call_i1() #0 {
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v1, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v1, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v1, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v1, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    s_xor_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -85,8 +85,8 @@ define amdgpu_gfx void @call_i1() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v1, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v1, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v1, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v1, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_load_b32 v1, off, s33 ; 4-byte Folded Reload
@@ -136,8 +136,8 @@ define amdgpu_gfx void @call_i16() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v1, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v1, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v1, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v1, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -161,8 +161,8 @@ define amdgpu_gfx void @call_i16() #0 {
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v1, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v1, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v1, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v1, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    s_xor_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -187,8 +187,8 @@ define amdgpu_gfx void @call_i16() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v1, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v1, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v1, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v1, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_load_b32 v1, off, s33 ; 4-byte Folded Reload
@@ -232,8 +232,8 @@ define amdgpu_gfx void @call_2xi16() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v1, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v1, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v1, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v1, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -257,8 +257,8 @@ define amdgpu_gfx void @call_2xi16() #0 {
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v1, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v1, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v1, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v1, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    s_xor_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -283,8 +283,8 @@ define amdgpu_gfx void @call_2xi16() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v1, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v1, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v1, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v1, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_load_b32 v1, off, s33 ; 4-byte Folded Reload
@@ -336,8 +336,8 @@ define amdgpu_gfx void @call_3xi16() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v2, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v2, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v2, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v2, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -361,8 +361,8 @@ define amdgpu_gfx void @call_3xi16() #0 {
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v2, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v2, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v2, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v2, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    s_xor_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -387,8 +387,8 @@ define amdgpu_gfx void @call_3xi16() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v2, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v2, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v2, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v2, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload
@@ -750,8 +750,8 @@ define amdgpu_gfx void @call_100xi32() #0 {
 ; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:116 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:120 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:124 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_readlane_b32 s31, v100, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v100, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v100, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v100, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload
@@ -840,8 +840,8 @@ define amdgpu_gfx void @call_100xi32() #0 {
 ; GFX10-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:116
 ; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:120
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:124
-; GFX10-NEXT:    v_readlane_b32 s31, v100, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v100, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v100, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    s_xor_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v100, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload
@@ -931,8 +931,8 @@ define amdgpu_gfx void @call_100xi32() #0 {
 ; GFX11-NEXT:    scratch_load_b32 v42, off, s33 offset:116
 ; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:120
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:124
-; GFX11-NEXT:    v_readlane_b32 s31, v100, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v100, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v100, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_load_b32 v100, off, s33 offset:128 ; 4-byte Folded Reload
@@ -2151,8 +2151,8 @@ define amdgpu_gfx void @call_512xi32() #0 {
 ; GFX9-NEXT:    s_add_i32 s32, s32, 0x60000
 ; GFX9-NEXT:    v_writelane_b32 v2, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[36:37]
-; GFX9-NEXT:    v_readlane_b32 s31, v2, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v2, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v2, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s34
 ; GFX9-NEXT:    s_mov_b32 s34, s38
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[36:37], -1
@@ -2181,8 +2181,8 @@ define amdgpu_gfx void @call_512xi32() #0 {
 ; GFX10-NEXT:    s_add_i32 s32, s32, 0x30000
 ; GFX10-NEXT:    v_writelane_b32 v2, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[36:37]
-; GFX10-NEXT:    v_readlane_b32 s31, v2, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v2, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v2, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s34
 ; GFX10-NEXT:    s_mov_b32 s34, s38
 ; GFX10-NEXT:    s_xor_saveexec_b32 s36, -1
@@ -2213,8 +2213,8 @@ define amdgpu_gfx void @call_512xi32() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v5, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v5, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v5, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v5, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s34
 ; GFX11-NEXT:    s_mov_b32 s34, s36
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
@@ -2889,8 +2889,8 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_readlane_b32 s31, v63, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v63, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v63, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s34
 ; GFX9-NEXT:    s_mov_b32 s34, s38
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -3167,8 +3167,8 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX10-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:48
 ; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:52
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:56
-; GFX10-NEXT:    v_readlane_b32 s31, v63, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v63, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v63, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s34
 ; GFX10-NEXT:    s_mov_b32 s34, s38
 ; GFX10-NEXT:    s_or_saveexec_b32 s36, -1
@@ -3347,8 +3347,8 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX11-NEXT:    scratch_load_b32 v42, off, s33 offset:44
 ; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:48
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:52
-; GFX11-NEXT:    v_readlane_b32 s31, v62, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v62, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v62, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s34
 ; GFX11-NEXT:    s_mov_b32 s34, s39
 ; GFX11-NEXT:    s_or_saveexec_b32 s0, -1
diff --git a/llvm/test/CodeGen/AMDGPU/global-alias.ll b/llvm/test/CodeGen/AMDGPU/global-alias.ll
index d8df20eb69452..4c7bef4aec091 100644
--- a/llvm/test/CodeGen/AMDGPU/global-alias.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-alias.ll
@@ -35,8 +35,8 @@ define void @bar() {
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
 ; CHECK-NEXT:    v_readlane_b32 s30, v40, 0
+; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
 ; CHECK-NEXT:    s_mov_b32 s32, s33
 ; CHECK-NEXT:    v_readlane_b32 s4, v40, 2
 ; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
index 76f204dd0c16a..e1f6906a89c29 100644
--- a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
+++ b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
@@ -9,28 +9,30 @@ define void @main(i1 %arg) #0 {
 ; CHECK-NEXT:    buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill
 ; CHECK-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; CHECK-NEXT:    s_mov_b64 exec, s[4:5]
-; CHECK-NEXT:    v_writelane_b32 v6, s30, 0
-; CHECK-NEXT:    v_writelane_b32 v6, s31, 1
-; CHECK-NEXT:    v_writelane_b32 v6, s36, 2
-; CHECK-NEXT:    v_writelane_b32 v6, s37, 3
-; CHECK-NEXT:    v_writelane_b32 v6, s38, 4
-; CHECK-NEXT:    v_writelane_b32 v6, s39, 5
-; CHECK-NEXT:    v_writelane_b32 v6, s48, 6
-; CHECK-NEXT:    v_writelane_b32 v6, s49, 7
-; CHECK-NEXT:    v_writelane_b32 v6, s50, 8
-; CHECK-NEXT:    v_writelane_b32 v6, s51, 9
-; CHECK-NEXT:    v_writelane_b32 v6, s52, 10
-; CHECK-NEXT:    v_writelane_b32 v6, s53, 11
-; CHECK-NEXT:    v_writelane_b32 v6, s54, 12
-; CHECK-NEXT:    v_writelane_b32 v6, s55, 13
-; CHECK-NEXT:    v_writelane_b32 v6, s64, 14
-; CHECK-NEXT:    v_writelane_b32 v6, s65, 15
-; CHECK-NEXT:    v_writelane_b32 v6, s66, 16
-; CHECK-NEXT:    v_writelane_b32 v6, s67, 17
-; CHECK-NEXT:    v_writelane_b32 v6, s68, 18
+; CHECK-NEXT:    v_writelane_b32 v6, s36, 0
+; CHECK-NEXT:    v_writelane_b32 v6, s37, 1
+; CHECK-NEXT:    v_writelane_b32 v6, s38, 2
+; CHECK-NEXT:    v_writelane_b32 v6, s39, 3
+; CHECK-NEXT:    v_writelane_b32 v6, s48, 4
+; CHECK-NEXT:    v_writelane_b32 v6, s49, 5
+; CHECK-NEXT:    v_writelane_b32 v6, s50, 6
+; CHECK-NEXT:    v_writelane_b32 v6, s51, 7
+; CHECK-NEXT:    v_writelane_b32 v6, s52, 8
+; CHECK-NEXT:    v_writelane_b32 v6, s53, 9
+; CHECK-NEXT:    v_writelane_b32 v6, s54, 10
+; CHECK-NEXT:    v_writelane_b32 v6, s55, 11
+; CHECK-NEXT:    v_writelane_b32 v6, s64, 12
+; CHECK-NEXT:    v_writelane_b32 v6, s65, 13
+; CHECK-NEXT:    v_writelane_b32 v6, s66, 14
+; CHECK-NEXT:    v_writelane_b32 v6, s67, 15
+; CHECK-NEXT:    v_writelane_b32 v6, s68, 16
+; CHECK-NEXT:    v_writelane_b32 v6, s69, 17
+; CHECK-NEXT:    v_writelane_b32 v6, s70, 18
+; CHECK-NEXT:    v_writelane_b32 v6, s71, 19
+; CHECK-NEXT:    v_writelane_b32 v6, s30, 20
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
 ; CHECK-NEXT:    s_mov_b64 s[8:9], 0
-; CHECK-NEXT:    v_writelane_b32 v6, s69, 19
+; CHECK-NEXT:    v_writelane_b32 v6, s31, 21
 ; CHECK-NEXT:    s_mov_b32 s68, 0
 ; CHECK-NEXT:    s_mov_b32 s69, s4
 ; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[8:9], 0x0
@@ -40,11 +42,11 @@ define void @main(i1 %arg) #0 {
 ; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    s_load_dwordx16 s[8:23], s[68:69], 0x130
 ; CHECK-NEXT:    ; implicit-def: $vgpr7 : SGPR spill to VGPR lane
-; CHECK-NEXT:    v_writelane_b32 v6, s70, 20
-; CHECK-NEXT:    v_writelane_b32 v6, s71, 21
+; CHECK-NEXT:    v_mov_b32_e32 v2, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v1, s4
-; CHECK-NEXT:    v_mov_b32_e32 v2, 0
+; CHECK-NEXT:    s_load_dwordx16 s[36:51], s[68:69], 0x2f0
+; CHECK-NEXT:    s_mov_b32 s70, s68
 ; CHECK-NEXT:    v_writelane_b32 v7, s8, 0
 ; CHECK-NEXT:    v_writelane_b32 v7, s9, 1
 ; CHECK-NEXT:    v_writelane_b32 v7, s10, 2
@@ -77,9 +79,7 @@ define void @main(i1 %arg) #0 {
 ; CHECK-NEXT:    v_writelane_b32 v7, s65, 29
 ; CHECK-NEXT:    v_writelane_b32 v7, s66, 30
 ; CHECK-NEXT:    s_load_dwordx16 s[8:23], s[68:69], 0x1f0
-; CHECK-NEXT:    s_load_dwordx16 s[36:51], s[68:69], 0x2f0
 ; CHECK-NEXT:    s_mov_b32 s69, s68
-; CHECK-NEXT:    s_mov_b32 s70, s68
 ; CHECK-NEXT:    s_mov_b32 s71, s68
 ; CHECK-NEXT:    v_writelane_b32 v7, s67, 31
 ; CHECK-NEXT:    image_sample_lz v1, v[1:2], s[60:67], s[68:71] dmask:0x1
@@ -225,29 +225,29 @@ define void @main(i1 %arg) #0 {
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; CHECK-NEXT:  .LBB0_10: ; %UnifiedReturnBlock
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[6:7]
-; CHECK-NEXT:    v_readlane_b32 s71, v6, 21
-; CHECK-NEXT:    v_readlane_b32 s70, v6, 20
-; CHECK-NEXT:    v_readlane_b32 s69, v6, 19
-; CHECK-NEXT:    v_readlane_b32 s68, v6, 18
-; CHECK-NEXT:    v_readlane_b32 s67, v6, 17
-; CHECK-NEXT:    v_readlane_b32 s66, v6, 16
-; CHECK-NEXT:    v_readlane_b32 s65, v6, 15
-; CHECK-NEXT:    v_readlane_b32 s64, v6, 14
-; CHECK-NEXT:    v_readlane_b32 s55, v6, 13
-; CHECK-NEXT:    v_readlane_b32 s54, v6, 12
-; CHECK-NEXT:    v_readlane_b32 s53, v6, 11
-; CHECK-NEXT:    v_readlane_b32 s52, v6, 10
+; CHECK-NEXT:    v_readlane_b32 s30, v6, 20
+; CHECK-NEXT:    v_readlane_b32 s31, v6, 21
+; CHECK-NEXT:    v_readlane_b32 s71, v6, 19
+; CHECK-NEXT:    v_readlane_b32 s70, v6, 18
+; CHECK-NEXT:    v_readlane_b32 s69, v6, 17
+; CHECK-NEXT:    v_readlane_b32 s68, v6, 16
+; CHECK-NEXT:    v_readlane_b32 s67, v6, 15
+; CHECK-NEXT:    v_readlane_b32 s66, v6, 14
+; CHECK-NEXT:    v_readlane_b32 s65, v6, 13
+; CHECK-NEXT:    v_readlane_b32 s64, v6, 12
+; CHECK-NEXT:    v_readlane_b32 s55, v6, 11
+; CHECK-NEXT:    v_readlane_b32 s54, v6, 10
+; CHECK-NEXT:    v_readlane_b32 s53, v6, 9
+; CHECK-NEXT:    v_readlane_b32 s52, v6, 8
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_readlane_b32 s51, v6, 9
-; CHECK-NEXT:    v_readlane_b32 s50, v6, 8
-; CHECK-NEXT:    v_readlane_b32 s49, v6, 7
-; CHECK-NEXT:    v_readlane_b32 s48, v6, 6
-; CHECK-NEXT:    v_readlane_b32 s39, v6, 5
-; CHECK-NEXT:    v_readlane_b32 s38, v6, 4
-; CHECK-NEXT:    v_readlane_b32 s37, v6, 3
-; CHECK-NEXT:    v_readlane_b32 s36, v6, 2
-; CHECK-NEXT:    v_readlane_b32 s31, v6, 1
-; CHECK-NEXT:    v_readlane_b32 s30, v6, 0
+; CHECK-NEXT:    v_readlane_b32 s51, v6, 7
+; CHECK-NEXT:    v_readlane_b32 s50, v6, 6
+; CHECK-NEXT:    v_readlane_b32 s49, v6, 5
+; CHECK-NEXT:    v_readlane_b32 s48, v6, 4
+; CHECK-NEXT:    v_readlane_b32 s39, v6, 3
+; CHECK-NEXT:    v_readlane_b32 s38, v6, 2
+; CHECK-NEXT:    v_readlane_b32 s37, v6, 1
+; CHECK-NEXT:    v_readlane_b32 s36, v6, 0
 ; CHECK-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; CHECK-NEXT:    buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload
 ; CHECK-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -336,7 +336,7 @@ declare <2 x float> @llvm.amdgcn.image.sample.lz.2d.v2f32.f32(i32 immarg, float,
 declare void @llvm.amdgcn.raw.buffer.store.v3i32(<3 x i32>, <4 x i32>, i32, i32, i32 immarg) #3
 declare void @llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32 immarg) #3
 
-attributes #0 = { "amdgpu-waves-per-eu"="10,10" }
+attributes #0 = { nounwind "amdgpu-waves-per-eu"="10,10" }
 attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
 attributes #2 = { nocallback nofree nosync nounwind willreturn memory(read) }
 attributes #3 = { nocallback nofree nosync nounwind willreturn memory(write) }
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
index a208cfdb197af..2aaaff1ecc407 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
@@ -128,24 +128,24 @@ define void @test_indirect_call_vgpr_ptr(ptr %fptr) {
 ; GCN-NEXT:    s_mov_b64 exec, s[18:19]
 ; GCN-NEXT:    v_writelane_b32 v40, s16, 18
 ; GCN-NEXT:    s_addk_i32 s32, 0x400
-; GCN-NEXT:    v_writelane_b32 v40, s30, 0
-; GCN-NEXT:    v_writelane_b32 v40, s31, 1
-; GCN-NEXT:    v_writelane_b32 v40, s34, 2
-; GCN-NEXT:    v_writelane_b32 v40, s35, 3
-; GCN-NEXT:    v_writelane_b32 v40, s36, 4
-; GCN-NEXT:    v_writelane_b32 v40, s37, 5
-; GCN-NEXT:    v_writelane_b32 v40, s38, 6
-; GCN-NEXT:    v_writelane_b32 v40, s39, 7
-; GCN-NEXT:    v_writelane_b32 v40, s48, 8
-; GCN-NEXT:    v_writelane_b32 v40, s49, 9
-; GCN-NEXT:    v_writelane_b32 v40, s50, 10
-; GCN-NEXT:    v_writelane_b32 v40, s51, 11
-; GCN-NEXT:    v_writelane_b32 v40, s52, 12
-; GCN-NEXT:    v_writelane_b32 v40, s53, 13
-; GCN-NEXT:    v_writelane_b32 v40, s54, 14
-; GCN-NEXT:    v_writelane_b32 v40, s55, 15
-; GCN-NEXT:    v_writelane_b32 v40, s64, 16
-; GCN-NEXT:    v_writelane_b32 v40, s65, 17
+; GCN-NEXT:    v_writelane_b32 v40, s34, 0
+; GCN-NEXT:    v_writelane_b32 v40, s35, 1
+; GCN-NEXT:    v_writelane_b32 v40, s36, 2
+; GCN-NEXT:    v_writelane_b32 v40, s37, 3
+; GCN-NEXT:    v_writelane_b32 v40, s38, 4
+; GCN-NEXT:    v_writelane_b32 v40, s39, 5
+; GCN-NEXT:    v_writelane_b32 v40, s48, 6
+; GCN-NEXT:    v_writelane_b32 v40, s49, 7
+; GCN-NEXT:    v_writelane_b32 v40, s50, 8
+; GCN-NEXT:    v_writelane_b32 v40, s51, 9
+; GCN-NEXT:    v_writelane_b32 v40, s52, 10
+; GCN-NEXT:    v_writelane_b32 v40, s53, 11
+; GCN-NEXT:    v_writelane_b32 v40, s54, 12
+; GCN-NEXT:    v_writelane_b32 v40, s55, 13
+; GCN-NEXT:    v_writelane_b32 v40, s64, 14
+; GCN-NEXT:    v_writelane_b32 v40, s65, 15
+; GCN-NEXT:    v_writelane_b32 v40, s30, 16
+; GCN-NEXT:    v_writelane_b32 v40, s31, 17
 ; GCN-NEXT:    s_mov_b32 s50, s15
 ; GCN-NEXT:    s_mov_b32 s51, s14
 ; GCN-NEXT:    s_mov_b32 s52, s13
@@ -175,24 +175,24 @@ define void @test_indirect_call_vgpr_ptr(ptr %fptr) {
 ; GCN-NEXT:    s_cbranch_execnz .LBB2_1
 ; GCN-NEXT:  ; %bb.2:
 ; GCN-NEXT:    s_mov_b64 exec, s[54:55]
-; GCN-NEXT:    v_readlane_b32 s65, v40, 17
-; GCN-NEXT:    v_readlane_b32 s64, v40, 16
-; GCN-NEXT:    v_readlane_b32 s55, v40, 15
-; GCN-NEXT:    v_readlane_b32 s54, v40, 14
-; GCN-NEXT:    v_readlane_b32 s53, v40, 13
-; GCN-NEXT:    v_readlane_b32 s52, v40, 12
-; GCN-NEXT:    v_readlane_b32 s51, v40, 11
-; GCN-NEXT:    v_readlane_b32 s50, v40, 10
-; GCN-NEXT:    v_readlane_b32 s49, v40, 9
-; GCN-NEXT:    v_readlane_b32 s48, v40, 8
-; GCN-NEXT:    v_readlane_b32 s39, v40, 7
-; GCN-NEXT:    v_readlane_b32 s38, v40, 6
-; GCN-NEXT:    v_readlane_b32 s37, v40, 5
-; GCN-NEXT:    v_readlane_b32 s36, v40, 4
-; GCN-NEXT:    v_readlane_b32 s35, v40, 3
-; GCN-NEXT:    v_readlane_b32 s34, v40, 2
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
-; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s30, v40, 16
+; GCN-NEXT:    v_readlane_b32 s31, v40, 17
+; GCN-NEXT:    v_readlane_b32 s65, v40, 15
+; GCN-NEXT:    v_readlane_b32 s64, v40, 14
+; GCN-NEXT:    v_readlane_b32 s55, v40, 13
+; GCN-NEXT:    v_readlane_b32 s54, v40, 12
+; GCN-NEXT:    v_readlane_b32 s53, v40, 11
+; GCN-NEXT:    v_readlane_b32 s52, v40, 10
+; GCN-NEXT:    v_readlane_b32 s51, v40, 9
+; GCN-NEXT:    v_readlane_b32 s50, v40, 8
+; GCN-NEXT:    v_readlane_b32 s49, v40, 7
+; GCN-NEXT:    v_readlane_b32 s48, v40, 6
+; GCN-NEXT:    v_readlane_b32 s39, v40, 5
+; GCN-NEXT:    v_readlane_b32 s38, v40, 4
+; GCN-NEXT:    v_readlane_b32 s37, v40, 3
+; GCN-NEXT:    v_readlane_b32 s36, v40, 2
+; GCN-NEXT:    v_readlane_b32 s35, v40, 1
+; GCN-NEXT:    v_readlane_b32 s34, v40, 0
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 18
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -212,24 +212,24 @@ define void @test_indirect_call_vgpr_ptr(ptr %fptr) {
 ; GISEL-NEXT:    s_mov_b64 exec, s[18:19]
 ; GISEL-NEXT:    v_writelane_b32 v40, s16, 18
 ; GISEL-NEXT:    s_addk_i32 s32, 0x400
-; GISEL-NEXT:    v_writelane_b32 v40, s30, 0
-; GISEL-NEXT:    v_writelane_b32 v40, s31, 1
-; GISEL-NEXT:    v_writelane_b32 v40, s34, 2
-; GISEL-NEXT:    v_writelane_b32 v40, s35, 3
-; GISEL-NEXT:    v_writelane_b32 v40, s36, 4
-; GISEL-NEXT:    v_writelane_b32 v40, s37, 5
-; GISEL-NEXT:    v_writelane_b32 v40, s38, 6
-; GISEL-NEXT:    v_writelane_b32 v40, s39, 7
-; GISEL-NEXT:    v_writelane_b32 v40, s48, 8
-; GISEL-NEXT:    v_writelane_b32 v40, s49, 9
-; GISEL-NEXT:    v_writelane_b32 v40, s50, 10
-; GISEL-NEXT:    v_writelane_b32 v40, s51, 11
-; GISEL-NEXT:    v_writelane_b32 v40, s52, 12
-; GISEL-NEXT:    v_writelane_b32 v40, s53, 13
-; GISEL-NEXT:    v_writelane_b32 v40, s54, 14
-; GISEL-NEXT:    v_writelane_b32 v40, s55, 15
-; GISEL-NEXT:    v_writelane_b32 v40, s64, 16
-; GISEL-NEXT:    v_writelane_b32 v40, s65, 17
+; GISEL-NEXT:    v_writelane_b32 v40, s34, 0
+; GISEL-NEXT:    v_writelane_b32 v40, s35, 1
+; GISEL-NEXT:    v_writelane_b32 v40, s36, 2
+; GISEL-NEXT:    v_writelane_b32 v40, s37, 3
+; GISEL-NEXT:    v_writelane_b32 v40, s38, 4
+; GISEL-NEXT:    v_writelane_b32 v40, s39, 5
+; GISEL-NEXT:    v_writelane_b32 v40, s48, 6
+; GISEL-NEXT:    v_writelane_b32 v40, s49, 7
+; GISEL-NEXT:    v_writelane_b32 v40, s50, 8
+; GISEL-NEXT:    v_writelane_b32 v40, s51, 9
+; GISEL-NEXT:    v_writelane_b32 v40, s52, 10
+; GISEL-NEXT:    v_writelane_b32 v40, s53, 11
+; GISEL-NEXT:    v_writelane_b32 v40, s54, 12
+; GISEL-NEXT:    v_writelane_b32 v40, s55, 13
+; GISEL-NEXT:    v_writelane_b32 v40, s64, 14
+; GISEL-NEXT:    v_writelane_b32 v40, s65, 15
+; GISEL-NEXT:    v_writelane_b32 v40, s30, 16
+; GISEL-NEXT:    v_writelane_b32 v40, s31, 17
 ; GISEL-NEXT:    s_mov_b32 s50, s15
 ; GISEL-NEXT:    s_mov_b32 s51, s14
 ; GISEL-NEXT:    s_mov_b32 s52, s13
@@ -259,24 +259,24 @@ define void @test_indirect_call_vgpr_ptr(ptr %fptr) {
 ; GISEL-NEXT:    s_cbranch_execnz .LBB2_1
 ; GISEL-NEXT:  ; %bb.2:
 ; GISEL-NEXT:    s_mov_b64 exec, s[54:55]
-; GISEL-NEXT:    v_readlane_b32 s65, v40, 17
-; GISEL-NEXT:    v_readlane_b32 s64, v40, 16
-; GISEL-NEXT:    v_readlane_b32 s55, v40, 15
-; GISEL-NEXT:    v_readlane_b32 s54, v40, 14
-; GISEL-NEXT:    v_readlane_b32 s53, v40, 13
-; GISEL-NEXT:    v_readlane_b32 s52, v40, 12
-; GISEL-NEXT:    v_readlane_b32 s51, v40, 11
-; GISEL-NEXT:    v_readlane_b32 s50, v40, 10
-; GISEL-NEXT:    v_readlane_b32 s49, v40, 9
-; GISEL-NEXT:    v_readlane_b32 s48, v40, 8
-; GISEL-NEXT:    v_readlane_b32 s39, v40, 7
-; GISEL-NEXT:    v_readlane_b32 s38, v40, 6
-; GISEL-NEXT:    v_readlane_b32 s37, v40, 5
-; GISEL-NEXT:    v_readlane_b32 s36, v40, 4
-; GISEL-NEXT:    v_readlane_b32 s35, v40, 3
-; GISEL-NEXT:    v_readlane_b32 s34, v40, 2
-; GISEL-NEXT:    v_readlane_b32 s31, v40, 1
-; GISEL-NEXT:    v_readlane_b32 s30, v40, 0
+; GISEL-NEXT:    v_readlane_b32 s30, v40, 16
+; GISEL-NEXT:    v_readlane_b32 s31, v40, 17
+; GISEL-NEXT:    v_readlane_b32 s65, v40, 15
+; GISEL-NEXT:    v_readlane_b32 s64, v40, 14
+; GISEL-NEXT:    v_readlane_b32 s55, v40, 13
+; GISEL-NEXT:    v_readlane_b32 s54, v40, 12
+; GISEL-NEXT:    v_readlane_b32 s53, v40, 11
+; GISEL-NEXT:    v_readlane_b32 s52, v40, 10
+; GISEL-NEXT:    v_readlane_b32 s51, v40, 9
+; GISEL-NEXT:    v_readlane_b32 s50, v40, 8
+; GISEL-NEXT:    v_readlane_b32 s49, v40, 7
+; GISEL-NEXT:    v_readlane_b32 s48, v40, 6
+; GISEL-NEXT:    v_readlane_b32 s39, v40, 5
+; GISEL-NEXT:    v_readlane_b32 s38, v40, 4
+; GISEL-NEXT:    v_readlane_b32 s37, v40, 3
+; GISEL-NEXT:    v_readlane_b32 s36, v40, 2
+; GISEL-NEXT:    v_readlane_b32 s35, v40, 1
+; GISEL-NEXT:    v_readlane_b32 s34, v40, 0
 ; GISEL-NEXT:    s_mov_b32 s32, s33
 ; GISEL-NEXT:    v_readlane_b32 s4, v40, 18
 ; GISEL-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -300,24 +300,24 @@ define void @test_indirect_call_vgpr_ptr_arg(ptr %fptr) {
 ; GCN-NEXT:    s_mov_b64 exec, s[18:19]
 ; GCN-NEXT:    v_writelane_b32 v40, s16, 18
 ; GCN-NEXT:    s_addk_i32 s32, 0x400
-; GCN-NEXT:    v_writelane_b32 v40, s30, 0
-; GCN-NEXT:    v_writelane_b32 v40, s31, 1
-; GCN-NEXT:    v_writelane_b32 v40, s34, 2
-; GCN-NEXT:    v_writelane_b32 v40, s35, 3
-; GCN-NEXT:    v_writelane_b32 v40, s36, 4
-; GCN-NEXT:    v_writelane_b32 v40, s37, 5
-; GCN-NEXT:    v_writelane_b32 v40, s38, 6
-; GCN-NEXT:    v_writelane_b32 v40, s39, 7
-; GCN-NEXT:    v_writelane_b32 v40, s48, 8
-; GCN-NEXT:    v_writelane_b32 v40, s49, 9
-; GCN-NEXT:    v_writelane_b32 v40, s50, 10
-; GCN-NEXT:    v_writelane_b32 v40, s51, 11
-; GCN-NEXT:    v_writelane_b32 v40, s52, 12
-; GCN-NEXT:    v_writelane_b32 v40, s53, 13
-; GCN-NEXT:    v_writelane_b32 v40, s54, 14
-; GCN-NEXT:    v_writelane_b32 v40, s55, 15
-; GCN-NEXT:    v_writelane_b32 v40, s64, 16
-; GCN-NEXT:    v_writelane_b32 v40, s65, 17
+; GCN-NEXT:    v_writelane_b32 v40, s34, 0
+; GCN-NEXT:    v_writelane_b32 v40, s35, 1
+; GCN-NEXT:    v_writelane_b32 v40, s36, 2
+; GCN-NEXT:    v_writelane_b32 v40, s37, 3
+; GCN-NEXT:    v_writelane_b32 v40, s38, 4
+; GCN-NEXT:    v_writelane_b32 v40, s39, 5
+; GCN-NEXT:    v_writelane_b32 v40, s48, 6
+; GCN-NEXT:    v_writelane_b32 v40, s49, 7
+; GCN-NEXT:    v_writelane_b32 v40, s50, 8
+; GCN-NEXT:    v_writelane_b32 v40, s51, 9
+; GCN-NEXT:    v_writelane_b32 v40, s52, 10
+; GCN-NEXT:    v_writelane_b32 v40, s53, 11
+; GCN-NEXT:    v_writelane_b32 v40, s54, 12
+; GCN-NEXT:    v_writelane_b32 v40, s55, 13
+; GCN-NEXT:    v_writelane_b32 v40, s64, 14
+; GCN-NEXT:    v_writelane_b32 v40, s65, 15
+; GCN-NEXT:    v_writelane_b32 v40, s30, 16
+; GCN-NEXT:    v_writelane_b32 v40, s31, 17
 ; GCN-NEXT:    s_mov_b32 s50, s15
 ; GCN-NEXT:    s_mov_b32 s51, s14
 ; GCN-NEXT:    s_mov_b32 s52, s13
@@ -350,24 +350,24 @@ define void @test_indirect_call_vgpr_ptr_arg(ptr %fptr) {
 ; GCN-NEXT:    s_cbranch_execnz .LBB3_1
 ; GCN-NEXT:  ; %bb.2:
 ; GCN-NEXT:    s_mov_b64 exec, s[54:55]
-; GCN-NEXT:    v_readlane_b32 s65, v40, 17
-; GCN-NEXT:    v_readlane_b32 s64, v40, 16
-; GCN-NEXT:    v_readlane_b32 s55, v40, 15
-; GCN-NEXT:    v_readlane_b32 s54, v40, 14
-; GCN-NEXT:    v_readlane_b32 s53, v40, 13
-; GCN-NEXT:    v_readlane_b32 s52, v40, 12
-; GCN-NEXT:    v_readlane_b32 s51, v40, 11
-; GCN-NEXT:    v_readlane_b32 s50, v40, 10
-; GCN-NEXT:    v_readlane_b32 s49, v40, 9
-; GCN-NEXT:    v_readlane_b32 s48, v40, 8
-; GCN-NEXT:    v_readlane_b32 s39, v40, 7
-; GCN-NEXT:    v_readlane_b32 s38, v40, 6
-; GCN-NEXT:    v_readlane_b32 s37, v40, 5
-; GCN-NEXT:    v_readlane_b32 s36, v40, 4
-; GCN-NEXT:    v_readlane_b32 s35, v40, 3
-; GCN-NEXT:    v_readlane_b32 s34, v40, 2
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
-; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s30, v40, 16
+; GCN-NEXT:    v_readlane_b32 s31, v40, 17
+; GCN-NEXT:    v_readlane_b32 s65, v40, 15
+; GCN-NEXT:    v_readlane_b32 s64, v40, 14
+; GCN-NEXT:    v_readlane_b32 s55, v40, 13
+; GCN-NEXT:    v_readlane_b32 s54, v40, 12
+; GCN-NEXT:    v_readlane_b32 s53, v40, 11
+; GCN-NEXT:    v_readlane_b32 s52, v40, 10
+; GCN-NEXT:    v_readlane_b32 s51, v40, 9
+; GCN-NEXT:    v_readlane_b32 s50, v40, 8
+; GCN-NEXT:    v_readlane_b32 s49, v40, 7
+; GCN-NEXT:    v_readlane_b32 s48, v40, 6
+; GCN-NEXT:    v_readlane_b32 s39, v40, 5
+; GCN-NEXT:    v_readlane_b32 s38, v40, 4
+; GCN-NEXT:    v_readlane_b32 s37, v40, 3
+; GCN-NEXT:    v_readlane_b32 s36, v40, 2
+; GCN-NEXT:    v_readlane_b32 s35, v40, 1
+; GCN-NEXT:    v_readlane_b32 s34, v40, 0
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 18
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -387,24 +387,24 @@ define void @test_indirect_call_vgpr_ptr_arg(ptr %fptr) {
 ; GISEL-NEXT:    s_mov_b64 exec, s[18:19]
 ; GISEL-NEXT:    v_writelane_b32 v40, s16, 18
 ; GISEL-NEXT:    s_addk_i32 s32, 0x400
-; GISEL-NEXT:    v_writelane_b32 v40, s30, 0
-; GISEL-NEXT:    v_writelane_b32 v40, s31, 1
-; GISEL-NEXT:    v_writelane_b32 v40, s34, 2
-; GISEL-NEXT:    v_writelane_b32 v40, s35, 3
-; GISEL-NEXT:    v_writelane_b32 v40, s36, 4
-; GISEL-NEXT:    v_writelane_b32 v40, s37, 5
-; GISEL-NEXT:    v_writelane_b32 v40, s38, 6
-; GISEL-NEXT:    v_writelane_b32 v40, s39, 7
-; GISEL-NEXT:    v_writelane_b32 v40, s48, 8
-; GISEL-NEXT:    v_writelane_b32 v40, s49, 9
-; GISEL-NEXT:    v_writelane_b32 v40, s50, 10
-; GISEL-NEXT:    v_writelane_b32 v40, s51, 11
-; GISEL-NEXT:    v_writelane_b32 v40, s52, 12
-; GISEL-NEXT:    v_writelane_b32 v40, s53, 13
-; GISEL-NEXT:    v_writelane_b32 v40, s54, 14
-; GISEL-NEXT:    v_writelane_b32 v40, s55, 15
-; GISEL-NEXT:    v_writelane_b32 v40, s64, 16
-; GISEL-NEXT:    v_writelane_b32 v40, s65, 17
+; GISEL-NEXT:    v_writelane_b32 v40, s34, 0
+; GISEL-NEXT:    v_writelane_b32 v40, s35, 1
+; GISEL-NEXT:    v_writelane_b32 v40, s36, 2
+; GISEL-NEXT:    v_writelane_b32 v40, s37, 3
+; GISEL-NEXT:    v_writelane_b32 v40, s38, 4
+; GISEL-NEXT:    v_writelane_b32 v40, s39, 5
+; GISEL-NEXT:    v_writelane_b32 v40, s48, 6
+; GISEL-NEXT:    v_writelane_b32 v40, s49, 7
+; GISEL-NEXT:    v_writelane_b32 v40, s50, 8
+; GISEL-NEXT:    v_writelane_b32 v40, s51, 9
+; GISEL-NEXT:    v_writelane_b32 v40, s52, 10
+; GISEL-NEXT:    v_writelane_b32 v40, s53, 11
+; GISEL-NEXT:    v_writelane_b32 v40, s54, 12
+; GISEL-NEXT:    v_writelane_b32 v40, s55, 13
+; GISEL-NEXT:    v_writelane_b32 v40, s64, 14
+; GISEL-NEXT:    v_writelane_b32 v40, s65, 15
+; GISEL-NEXT:    v_writelane_b32 v40, s30, 16
+; GISEL-NEXT:    v_writelane_b32 v40, s31, 17
 ; GISEL-NEXT:    s_mov_b32 s50, s15
 ; GISEL-NEXT:    s_mov_b32 s51, s14
 ; GISEL-NEXT:    s_mov_b32 s52, s13
@@ -435,24 +435,24 @@ define void @test_indirect_call_vgpr_ptr_arg(ptr %fptr) {
 ; GISEL-NEXT:    s_cbranch_execnz .LBB3_1
 ; GISEL-NEXT:  ; %bb.2:
 ; GISEL-NEXT:    s_mov_b64 exec, s[54:55]
-; GISEL-NEXT:    v_readlane_b32 s65, v40, 17
-; GISEL-NEXT:    v_readlane_b32 s64, v40, 16
-; GISEL-NEXT:    v_readlane_b32 s55, v40, 15
-; GISEL-NEXT:    v_readlane_b32 s54, v40, 14
-; GISEL-NEXT:    v_readlane_b32 s53, v40, 13
-; GISEL-NEXT:    v_readlane_b32 s52, v40, 12
-; GISEL-NEXT:    v_readlane_b32 s51, v40, 11
-; GISEL-NEXT:    v_readlane_b32 s50, v40, 10
-; GISEL-NEXT:    v_readlane_b32 s49, v40, 9
-; GISEL-NEXT:    v_readlane_b32 s48, v40, 8
-; GISEL-NEXT:    v_readlane_b32 s39, v40, 7
-; GISEL-NEXT:    v_readlane_b32 s38, v40, 6
-; GISEL-NEXT:    v_readlane_b32 s37, v40, 5
-; GISEL-NEXT:    v_readlane_b32 s36, v40, 4
-; GISEL-NEXT:    v_readlane_b32 s35, v40, 3
-; GISEL-NEXT:    v_readlane_b32 s34, v40, 2
-; GISEL-NEXT:    v_readlane_b32 s31, v40, 1
-; GISEL-NEXT:    v_readlane_b32 s30, v40, 0
+; GISEL-NEXT:    v_readlane_b32 s30, v40, 16
+; GISEL-NEXT:    v_readlane_b32 s31, v40, 17
+; GISEL-NEXT:    v_readlane_b32 s65, v40, 15
+; GISEL-NEXT:    v_readlane_b32 s64, v40, 14
+; GISEL-NEXT:    v_readlane_b32 s55, v40, 13
+; GISEL-NEXT:    v_readlane_b32 s54, v40, 12
+; GISEL-NEXT:    v_readlane_b32 s53, v40, 11
+; GISEL-NEXT:    v_readlane_b32 s52, v40, 10
+; GISEL-NEXT:    v_readlane_b32 s51, v40, 9
+; GISEL-NEXT:    v_readlane_b32 s50, v40, 8
+; GISEL-NEXT:    v_readlane_b32 s49, v40, 7
+; GISEL-NEXT:    v_readlane_b32 s48, v40, 6
+; GISEL-NEXT:    v_readlane_b32 s39, v40, 5
+; GISEL-NEXT:    v_readlane_b32 s38, v40, 4
+; GISEL-NEXT:    v_readlane_b32 s37, v40, 3
+; GISEL-NEXT:    v_readlane_b32 s36, v40, 2
+; GISEL-NEXT:    v_readlane_b32 s35, v40, 1
+; GISEL-NEXT:    v_readlane_b32 s34, v40, 0
 ; GISEL-NEXT:    s_mov_b32 s32, s33
 ; GISEL-NEXT:    v_readlane_b32 s4, v40, 18
 ; GISEL-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -476,24 +476,24 @@ define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) {
 ; GCN-NEXT:    s_mov_b64 exec, s[18:19]
 ; GCN-NEXT:    v_writelane_b32 v40, s16, 18
 ; GCN-NEXT:    s_addk_i32 s32, 0x400
-; GCN-NEXT:    v_writelane_b32 v40, s30, 0
-; GCN-NEXT:    v_writelane_b32 v40, s31, 1
-; GCN-NEXT:    v_writelane_b32 v40, s34, 2
-; GCN-NEXT:    v_writelane_b32 v40, s35, 3
-; GCN-NEXT:    v_writelane_b32 v40, s36, 4
-; GCN-NEXT:    v_writelane_b32 v40, s37, 5
-; GCN-NEXT:    v_writelane_b32 v40, s38, 6
-; GCN-NEXT:    v_writelane_b32 v40, s39, 7
-; GCN-NEXT:    v_writelane_b32 v40, s48, 8
-; GCN-NEXT:    v_writelane_b32 v40, s49, 9
-; GCN-NEXT:    v_writelane_b32 v40, s50, 10
-; GCN-NEXT:    v_writelane_b32 v40, s51, 11
-; GCN-NEXT:    v_writelane_b32 v40, s52, 12
-; GCN-NEXT:    v_writelane_b32 v40, s53, 13
-; GCN-NEXT:    v_writelane_b32 v40, s54, 14
-; GCN-NEXT:    v_writelane_b32 v40, s55, 15
-; GCN-NEXT:    v_writelane_b32 v40, s64, 16
-; GCN-NEXT:    v_writelane_b32 v40, s65, 17
+; GCN-NEXT:    v_writelane_b32 v40, s34, 0
+; GCN-NEXT:    v_writelane_b32 v40, s35, 1
+; GCN-NEXT:    v_writelane_b32 v40, s36, 2
+; GCN-NEXT:    v_writelane_b32 v40, s37, 3
+; GCN-NEXT:    v_writelane_b32 v40, s38, 4
+; GCN-NEXT:    v_writelane_b32 v40, s39, 5
+; GCN-NEXT:    v_writelane_b32 v40, s48, 6
+; GCN-NEXT:    v_writelane_b32 v40, s49, 7
+; GCN-NEXT:    v_writelane_b32 v40, s50, 8
+; GCN-NEXT:    v_writelane_b32 v40, s51, 9
+; GCN-NEXT:    v_writelane_b32 v40, s52, 10
+; GCN-NEXT:    v_writelane_b32 v40, s53, 11
+; GCN-NEXT:    v_writelane_b32 v40, s54, 12
+; GCN-NEXT:    v_writelane_b32 v40, s55, 13
+; GCN-NEXT:    v_writelane_b32 v40, s64, 14
+; GCN-NEXT:    v_writelane_b32 v40, s65, 15
+; GCN-NEXT:    v_writelane_b32 v40, s30, 16
+; GCN-NEXT:    v_writelane_b32 v40, s31, 17
 ; GCN-NEXT:    s_mov_b32 s50, s15
 ; GCN-NEXT:    s_mov_b32 s51, s14
 ; GCN-NEXT:    s_mov_b32 s52, s13
@@ -525,24 +525,24 @@ define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) {
 ; GCN-NEXT:  ; %bb.2:
 ; GCN-NEXT:    s_mov_b64 exec, s[54:55]
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, 1, v2
-; GCN-NEXT:    v_readlane_b32 s65, v40, 17
-; GCN-NEXT:    v_readlane_b32 s64, v40, 16
-; GCN-NEXT:    v_readlane_b32 s55, v40, 15
-; GCN-NEXT:    v_readlane_b32 s54, v40, 14
-; GCN-NEXT:    v_readlane_b32 s53, v40, 13
-; GCN-NEXT:    v_readlane_b32 s52, v40, 12
-; GCN-NEXT:    v_readlane_b32 s51, v40, 11
-; GCN-NEXT:    v_readlane_b32 s50, v40, 10
-; GCN-NEXT:    v_readlane_b32 s49, v40, 9
-; GCN-NEXT:    v_readlane_b32 s48, v40, 8
-; GCN-NEXT:    v_readlane_b32 s39, v40, 7
-; GCN-NEXT:    v_readlane_b32 s38, v40, 6
-; GCN-NEXT:    v_readlane_b32 s37, v40, 5
-; GCN-NEXT:    v_readlane_b32 s36, v40, 4
-; GCN-NEXT:    v_readlane_b32 s35, v40, 3
-; GCN-NEXT:    v_readlane_b32 s34, v40, 2
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
-; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s30, v40, 16
+; GCN-NEXT:    v_readlane_b32 s31, v40, 17
+; GCN-NEXT:    v_readlane_b32 s65, v40, 15
+; GCN-NEXT:    v_readlane_b32 s64, v40, 14
+; GCN-NEXT:    v_readlane_b32 s55, v40, 13
+; GCN-NEXT:    v_readlane_b32 s54, v40, 12
+; GCN-NEXT:    v_readlane_b32 s53, v40, 11
+; GCN-NEXT:    v_readlane_b32 s52, v40, 10
+; GCN-NEXT:    v_readlane_b32 s51, v40, 9
+; GCN-NEXT:    v_readlane_b32 s50, v40, 8
+; GCN-NEXT:    v_readlane_b32 s49, v40, 7
+; GCN-NEXT:    v_readlane_b32 s48, v40, 6
+; GCN-NEXT:    v_readlane_b32 s39, v40, 5
+; GCN-NEXT:    v_readlane_b32 s38, v40, 4
+; GCN-NEXT:    v_readlane_b32 s37, v40, 3
+; GCN-NEXT:    v_readlane_b32 s36, v40, 2
+; GCN-NEXT:    v_readlane_b32 s35, v40, 1
+; GCN-NEXT:    v_readlane_b32 s34, v40, 0
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 18
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -562,24 +562,24 @@ define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) {
 ; GISEL-NEXT:    s_mov_b64 exec, s[18:19]
 ; GISEL-NEXT:    v_writelane_b32 v40, s16, 18
 ; GISEL-NEXT:    s_addk_i32 s32, 0x400
-; GISEL-NEXT:    v_writelane_b32 v40, s30, 0
-; GISEL-NEXT:    v_writelane_b32 v40, s31, 1
-; GISEL-NEXT:    v_writelane_b32 v40, s34, 2
-; GISEL-NEXT:    v_writelane_b32 v40, s35, 3
-; GISEL-NEXT:    v_writelane_b32 v40, s36, 4
-; GISEL-NEXT:    v_writelane_b32 v40, s37, 5
-; GISEL-NEXT:    v_writelane_b32 v40, s38, 6
-; GISEL-NEXT:    v_writelane_b32 v40, s39, 7
-; GISEL-NEXT:    v_writelane_b32 v40, s48, 8
-; GISEL-NEXT:    v_writelane_b32 v40, s49, 9
-; GISEL-NEXT:    v_writelane_b32 v40, s50, 10
-; GISEL-NEXT:    v_writelane_b32 v40, s51, 11
-; GISEL-NEXT:    v_writelane_b32 v40, s52, 12
-; GISEL-NEXT:    v_writelane_b32 v40, s53, 13
-; GISEL-NEXT:    v_writelane_b32 v40, s54, 14
-; GISEL-NEXT:    v_writelane_b32 v40, s55, 15
-; GISEL-NEXT:    v_writelane_b32 v40, s64, 16
-; GISEL-NEXT:    v_writelane_b32 v40, s65, 17
+; GISEL-NEXT:    v_writelane_b32 v40, s34, 0
+; GISEL-NEXT:    v_writelane_b32 v40, s35, 1
+; GISEL-NEXT:    v_writelane_b32 v40, s36, 2
+; GISEL-NEXT:    v_writelane_b32 v40, s37, 3
+; GISEL-NEXT:    v_writelane_b32 v40, s38, 4
+; GISEL-NEXT:    v_writelane_b32 v40, s39, 5
+; GISEL-NEXT:    v_writelane_b32 v40, s48, 6
+; GISEL-NEXT:    v_writelane_b32 v40, s49, 7
+; GISEL-NEXT:    v_writelane_b32 v40, s50, 8
+; GISEL-NEXT:    v_writelane_b32 v40, s51, 9
+; GISEL-NEXT:    v_writelane_b32 v40, s52, 10
+; GISEL-NEXT:    v_writelane_b32 v40, s53, 11
+; GISEL-NEXT:    v_writelane_b32 v40, s54, 12
+; GISEL-NEXT:    v_writelane_b32 v40, s55, 13
+; GISEL-NEXT:    v_writelane_b32 v40, s64, 14
+; GISEL-NEXT:    v_writelane_b32 v40, s65, 15
+; GISEL-NEXT:    v_writelane_b32 v40, s30, 16
+; GISEL-NEXT:    v_writelane_b32 v40, s31, 17
 ; GISEL-NEXT:    s_mov_b32 s50, s15
 ; GISEL-NEXT:    s_mov_b32 s51, s14
 ; GISEL-NEXT:    s_mov_b32 s52, s13
@@ -611,24 +611,24 @@ define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) {
 ; GISEL-NEXT:  ; %bb.2:
 ; GISEL-NEXT:    s_mov_b64 exec, s[54:55]
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, 1, v1
-; GISEL-NEXT:    v_readlane_b32 s65, v40, 17
-; GISEL-NEXT:    v_readlane_b32 s64, v40, 16
-; GISEL-NEXT:    v_readlane_b32 s55, v40, 15
-; GISEL-NEXT:    v_readlane_b32 s54, v40, 14
-; GISEL-NEXT:    v_readlane_b32 s53, v40, 13
-; GISEL-NEXT:    v_readlane_b32 s52, v40, 12
-; GISEL-NEXT:    v_readlane_b32 s51, v40, 11
-; GISEL-NEXT:    v_readlane_b32 s50, v40, 10
-; GISEL-NEXT:    v_readlane_b32 s49, v40, 9
-; GISEL-NEXT:    v_readlane_b32 s48, v40, 8
-; GISEL-NEXT:    v_readlane_b32 s39, v40, 7
-; GISEL-NEXT:    v_readlane_b32 s38, v40, 6
-; GISEL-NEXT:    v_readlane_b32 s37, v40, 5
-; GISEL-NEXT:    v_readlane_b32 s36, v40, 4
-; GISEL-NEXT:    v_readlane_b32 s35, v40, 3
-; GISEL-NEXT:    v_readlane_b32 s34, v40, 2
-; GISEL-NEXT:    v_readlane_b32 s31, v40, 1
-; GISEL-NEXT:    v_readlane_b32 s30, v40, 0
+; GISEL-NEXT:    v_readlane_b32 s30, v40, 16
+; GISEL-NEXT:    v_readlane_b32 s31, v40, 17
+; GISEL-NEXT:    v_readlane_b32 s65, v40, 15
+; GISEL-NEXT:    v_readlane_b32 s64, v40, 14
+; GISEL-NEXT:    v_readlane_b32 s55, v40, 13
+; GISEL-NEXT:    v_readlane_b32 s54, v40, 12
+; GISEL-NEXT:    v_readlane_b32 s53, v40, 11
+; GISEL-NEXT:    v_readlane_b32 s52, v40, 10
+; GISEL-NEXT:    v_readlane_b32 s51, v40, 9
+; GISEL-NEXT:    v_readlane_b32 s50, v40, 8
+; GISEL-NEXT:    v_readlane_b32 s49, v40, 7
+; GISEL-NEXT:    v_readlane_b32 s48, v40, 6
+; GISEL-NEXT:    v_readlane_b32 s39, v40, 5
+; GISEL-NEXT:    v_readlane_b32 s38, v40, 4
+; GISEL-NEXT:    v_readlane_b32 s37, v40, 3
+; GISEL-NEXT:    v_readlane_b32 s36, v40, 2
+; GISEL-NEXT:    v_readlane_b32 s35, v40, 1
+; GISEL-NEXT:    v_readlane_b32 s34, v40, 0
 ; GISEL-NEXT:    s_mov_b32 s32, s33
 ; GISEL-NEXT:    v_readlane_b32 s4, v40, 18
 ; GISEL-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -653,26 +653,26 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) {
 ; GCN-NEXT:    s_mov_b64 exec, s[18:19]
 ; GCN-NEXT:    v_writelane_b32 v40, s16, 20
 ; GCN-NEXT:    s_addk_i32 s32, 0x400
-; GCN-NEXT:    v_writelane_b32 v40, s30, 0
-; GCN-NEXT:    v_writelane_b32 v40, s31, 1
-; GCN-NEXT:    v_writelane_b32 v40, s34, 2
-; GCN-NEXT:    v_writelane_b32 v40, s35, 3
-; GCN-NEXT:    v_writelane_b32 v40, s36, 4
-; GCN-NEXT:    v_writelane_b32 v40, s37, 5
-; GCN-NEXT:    v_writelane_b32 v40, s38, 6
-; GCN-NEXT:    v_writelane_b32 v40, s39, 7
-; GCN-NEXT:    v_writelane_b32 v40, s48, 8
-; GCN-NEXT:    v_writelane_b32 v40, s49, 9
-; GCN-NEXT:    v_writelane_b32 v40, s50, 10
-; GCN-NEXT:    v_writelane_b32 v40, s51, 11
-; GCN-NEXT:    v_writelane_b32 v40, s52, 12
-; GCN-NEXT:    v_writelane_b32 v40, s53, 13
-; GCN-NEXT:    v_writelane_b32 v40, s54, 14
-; GCN-NEXT:    v_writelane_b32 v40, s55, 15
-; GCN-NEXT:    v_writelane_b32 v40, s64, 16
-; GCN-NEXT:    v_writelane_b32 v40, s65, 17
-; GCN-NEXT:    v_writelane_b32 v40, s66, 18
-; GCN-NEXT:    v_writelane_b32 v40, s67, 19
+; GCN-NEXT:    v_writelane_b32 v40, s34, 0
+; GCN-NEXT:    v_writelane_b32 v40, s35, 1
+; GCN-NEXT:    v_writelane_b32 v40, s36, 2
+; GCN-NEXT:    v_writelane_b32 v40, s37, 3
+; GCN-NEXT:    v_writelane_b32 v40, s38, 4
+; GCN-NEXT:    v_writelane_b32 v40, s39, 5
+; GCN-NEXT:    v_writelane_b32 v40, s48, 6
+; GCN-NEXT:    v_writelane_b32 v40, s49, 7
+; GCN-NEXT:    v_writelane_b32 v40, s50, 8
+; GCN-NEXT:    v_writelane_b32 v40, s51, 9
+; GCN-NEXT:    v_writelane_b32 v40, s52, 10
+; GCN-NEXT:    v_writelane_b32 v40, s53, 11
+; GCN-NEXT:    v_writelane_b32 v40, s54, 12
+; GCN-NEXT:    v_writelane_b32 v40, s55, 13
+; GCN-NEXT:    v_writelane_b32 v40, s64, 14
+; GCN-NEXT:    v_writelane_b32 v40, s65, 15
+; GCN-NEXT:    v_writelane_b32 v40, s66, 16
+; GCN-NEXT:    v_writelane_b32 v40, s67, 17
+; GCN-NEXT:    v_writelane_b32 v40, s30, 18
+; GCN-NEXT:    v_writelane_b32 v40, s31, 19
 ; GCN-NEXT:    s_mov_b32 s50, s15
 ; GCN-NEXT:    s_mov_b32 s51, s14
 ; GCN-NEXT:    s_mov_b32 s52, s13
@@ -709,26 +709,26 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) {
 ; GCN-NEXT:    s_mov_b64 exec, s[64:65]
 ; GCN-NEXT:  .LBB5_4: ; %bb2
 ; GCN-NEXT:    s_or_b64 exec, exec, s[54:55]
-; GCN-NEXT:    v_readlane_b32 s67, v40, 19
-; GCN-NEXT:    v_readlane_b32 s66, v40, 18
-; GCN-NEXT:    v_readlane_b32 s65, v40, 17
-; GCN-NEXT:    v_readlane_b32 s64, v40, 16
-; GCN-NEXT:    v_readlane_b32 s55, v40, 15
-; GCN-NEXT:    v_readlane_b32 s54, v40, 14
-; GCN-NEXT:    v_readlane_b32 s53, v40, 13
-; GCN-NEXT:    v_readlane_b32 s52, v40, 12
-; GCN-NEXT:    v_readlane_b32 s51, v40, 11
-; GCN-NEXT:    v_readlane_b32 s50, v40, 10
-; GCN-NEXT:    v_readlane_b32 s49, v40, 9
-; GCN-NEXT:    v_readlane_b32 s48, v40, 8
-; GCN-NEXT:    v_readlane_b32 s39, v40, 7
-; GCN-NEXT:    v_readlane_b32 s38, v40, 6
-; GCN-NEXT:    v_readlane_b32 s37, v40, 5
-; GCN-NEXT:    v_readlane_b32 s36, v40, 4
-; GCN-NEXT:    v_readlane_b32 s35, v40, 3
-; GCN-NEXT:    v_readlane_b32 s34, v40, 2
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
-; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s30, v40, 18
+; GCN-NEXT:    v_readlane_b32 s31, v40, 19
+; GCN-NEXT:    v_readlane_b32 s67, v40, 17
+; GCN-NEXT:    v_readlane_b32 s66, v40, 16
+; GCN-NEXT:    v_readlane_b32 s65, v40, 15
+; GCN-NEXT:    v_readlane_b32 s64, v40, 14
+; GCN-NEXT:    v_readlane_b32 s55, v40, 13
+; GCN-NEXT:    v_readlane_b32 s54, v40, 12
+; GCN-NEXT:    v_readlane_b32 s53, v40, 11
+; GCN-NEXT:    v_readlane_b32 s52, v40, 10
+; GCN-NEXT:    v_readlane_b32 s51, v40, 9
+; GCN-NEXT:    v_readlane_b32 s50, v40, 8
+; GCN-NEXT:    v_readlane_b32 s49, v40, 7
+; GCN-NEXT:    v_readlane_b32 s48, v40, 6
+; GCN-NEXT:    v_readlane_b32 s39, v40, 5
+; GCN-NEXT:    v_readlane_b32 s38, v40, 4
+; GCN-NEXT:    v_readlane_b32 s37, v40, 3
+; GCN-NEXT:    v_readlane_b32 s36, v40, 2
+; GCN-NEXT:    v_readlane_b32 s35, v40, 1
+; GCN-NEXT:    v_readlane_b32 s34, v40, 0
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 20
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -748,26 +748,26 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) {
 ; GISEL-NEXT:    s_mov_b64 exec, s[18:19]
 ; GISEL-NEXT:    v_writelane_b32 v40, s16, 20
 ; GISEL-NEXT:    s_addk_i32 s32, 0x400
-; GISEL-NEXT:    v_writelane_b32 v40, s30, 0
-; GISEL-NEXT:    v_writelane_b32 v40, s31, 1
-; GISEL-NEXT:    v_writelane_b32 v40, s34, 2
-; GISEL-NEXT:    v_writelane_b32 v40, s35, 3
-; GISEL-NEXT:    v_writelane_b32 v40, s36, 4
-; GISEL-NEXT:    v_writelane_b32 v40, s37, 5
-; GISEL-NEXT:    v_writelane_b32 v40, s38, 6
-; GISEL-NEXT:    v_writelane_b32 v40, s39, 7
-; GISEL-NEXT:    v_writelane_b32 v40, s48, 8
-; GISEL-NEXT:    v_writelane_b32 v40, s49, 9
-; GISEL-NEXT:    v_writelane_b32 v40, s50, 10
-; GISEL-NEXT:    v_writelane_b32 v40, s51, 11
-; GISEL-NEXT:    v_writelane_b32 v40, s52, 12
-; GISEL-NEXT:    v_writelane_b32 v40, s53, 13
-; GISEL-NEXT:    v_writelane_b32 v40, s54, 14
-; GISEL-NEXT:    v_writelane_b32 v40, s55, 15
-; GISEL-NEXT:    v_writelane_b32 v40, s64, 16
-; GISEL-NEXT:    v_writelane_b32 v40, s65, 17
-; GISEL-NEXT:    v_writelane_b32 v40, s66, 18
-; GISEL-NEXT:    v_writelane_b32 v40, s67, 19
+; GISEL-NEXT:    v_writelane_b32 v40, s34, 0
+; GISEL-NEXT:    v_writelane_b32 v40, s35, 1
+; GISEL-NEXT:    v_writelane_b32 v40, s36, 2
+; GISEL-NEXT:    v_writelane_b32 v40, s37, 3
+; GISEL-NEXT:    v_writelane_b32 v40, s38, 4
+; GISEL-NEXT:    v_writelane_b32 v40, s39, 5
+; GISEL-NEXT:    v_writelane_b32 v40, s48, 6
+; GISEL-NEXT:    v_writelane_b32 v40, s49, 7
+; GISEL-NEXT:    v_writelane_b32 v40, s50, 8
+; GISEL-NEXT:    v_writelane_b32 v40, s51, 9
+; GISEL-NEXT:    v_writelane_b32 v40, s52, 10
+; GISEL-NEXT:    v_writelane_b32 v40, s53, 11
+; GISEL-NEXT:    v_writelane_b32 v40, s54, 12
+; GISEL-NEXT:    v_writelane_b32 v40, s55, 13
+; GISEL-NEXT:    v_writelane_b32 v40, s64, 14
+; GISEL-NEXT:    v_writelane_b32 v40, s65, 15
+; GISEL-NEXT:    v_writelane_b32 v40, s66, 16
+; GISEL-NEXT:    v_writelane_b32 v40, s67, 17
+; GISEL-NEXT:    v_writelane_b32 v40, s30, 18
+; GISEL-NEXT:    v_writelane_b32 v40, s31, 19
 ; GISEL-NEXT:    s_mov_b32 s50, s15
 ; GISEL-NEXT:    s_mov_b32 s51, s14
 ; GISEL-NEXT:    s_mov_b32 s52, s13
@@ -804,26 +804,26 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) {
 ; GISEL-NEXT:    s_mov_b64 exec, s[64:65]
 ; GISEL-NEXT:  .LBB5_4: ; %bb2
 ; GISEL-NEXT:    s_or_b64 exec, exec, s[54:55]
-; GISEL-NEXT:    v_readlane_b32 s67, v40, 19
-; GISEL-NEXT:    v_readlane_b32 s66, v40, 18
-; GISEL-NEXT:    v_readlane_b32 s65, v40, 17
-; GISEL-NEXT:    v_readlane_b32 s64, v40, 16
-; GISEL-NEXT:    v_readlane_b32 s55, v40, 15
-; GISEL-NEXT:    v_readlane_b32 s54, v40, 14
-; GISEL-NEXT:    v_readlane_b32 s53, v40, 13
-; GISEL-NEXT:    v_readlane_b32 s52, v40, 12
-; GISEL-NEXT:    v_readlane_b32 s51, v40, 11
-; GISEL-NEXT:    v_readlane_b32 s50, v40, 10
-; GISEL-NEXT:    v_readlane_b32 s49, v40, 9
-; GISEL-NEXT:    v_readlane_b32 s48, v40, 8
-; GISEL-NEXT:    v_readlane_b32 s39, v40, 7
-; GISEL-NEXT:    v_readlane_b32 s38, v40, 6
-; GISEL-NEXT:    v_readlane_b32 s37, v40, 5
-; GISEL-NEXT:    v_readlane_b32 s36, v40, 4
-; GISEL-NEXT:    v_readlane_b32 s35, v40, 3
-; GISEL-NEXT:    v_readlane_b32 s34, v40, 2
-; GISEL-NEXT:    v_readlane_b32 s31, v40, 1
-; GISEL-NEXT:    v_readlane_b32 s30, v40, 0
+; GISEL-NEXT:    v_readlane_b32 s30, v40, 18
+; GISEL-NEXT:    v_readlane_b32 s31, v40, 19
+; GISEL-NEXT:    v_readlane_b32 s67, v40, 17
+; GISEL-NEXT:    v_readlane_b32 s66, v40, 16
+; GISEL-NEXT:    v_readlane_b32 s65, v40, 15
+; GISEL-NEXT:    v_readlane_b32 s64, v40, 14
+; GISEL-NEXT:    v_readlane_b32 s55, v40, 13
+; GISEL-NEXT:    v_readlane_b32 s54, v40, 12
+; GISEL-NEXT:    v_readlane_b32 s53, v40, 11
+; GISEL-NEXT:    v_readlane_b32 s52, v40, 10
+; GISEL-NEXT:    v_readlane_b32 s51, v40, 9
+; GISEL-NEXT:    v_readlane_b32 s50, v40, 8
+; GISEL-NEXT:    v_readlane_b32 s49, v40, 7
+; GISEL-NEXT:    v_readlane_b32 s48, v40, 6
+; GISEL-NEXT:    v_readlane_b32 s39, v40, 5
+; GISEL-NEXT:    v_readlane_b32 s38, v40, 4
+; GISEL-NEXT:    v_readlane_b32 s37, v40, 3
+; GISEL-NEXT:    v_readlane_b32 s36, v40, 2
+; GISEL-NEXT:    v_readlane_b32 s35, v40, 1
+; GISEL-NEXT:    v_readlane_b32 s34, v40, 0
 ; GISEL-NEXT:    s_mov_b32 s32, s33
 ; GISEL-NEXT:    v_readlane_b32 s4, v40, 20
 ; GISEL-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -853,22 +853,22 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) {
 ; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[6:7]
 ; GCN-NEXT:    s_addk_i32 s32, 0x400
-; GCN-NEXT:    v_writelane_b32 v40, s30, 0
-; GCN-NEXT:    v_writelane_b32 v40, s31, 1
-; GCN-NEXT:    v_writelane_b32 v40, s34, 2
-; GCN-NEXT:    v_writelane_b32 v40, s35, 3
-; GCN-NEXT:    v_writelane_b32 v40, s36, 4
-; GCN-NEXT:    v_writelane_b32 v40, s37, 5
-; GCN-NEXT:    v_writelane_b32 v40, s38, 6
-; GCN-NEXT:    v_writelane_b32 v40, s39, 7
-; GCN-NEXT:    v_writelane_b32 v40, s48, 8
-; GCN-NEXT:    v_writelane_b32 v40, s49, 9
-; GCN-NEXT:    v_writelane_b32 v40, s50, 10
-; GCN-NEXT:    v_writelane_b32 v40, s51, 11
-; GCN-NEXT:    v_writelane_b32 v40, s52, 12
-; GCN-NEXT:    v_writelane_b32 v40, s53, 13
-; GCN-NEXT:    v_writelane_b32 v40, s54, 14
-; GCN-NEXT:    v_writelane_b32 v40, s55, 15
+; GCN-NEXT:    v_writelane_b32 v40, s34, 0
+; GCN-NEXT:    v_writelane_b32 v40, s35, 1
+; GCN-NEXT:    v_writelane_b32 v40, s36, 2
+; GCN-NEXT:    v_writelane_b32 v40, s37, 3
+; GCN-NEXT:    v_writelane_b32 v40, s38, 4
+; GCN-NEXT:    v_writelane_b32 v40, s39, 5
+; GCN-NEXT:    v_writelane_b32 v40, s48, 6
+; GCN-NEXT:    v_writelane_b32 v40, s49, 7
+; GCN-NEXT:    v_writelane_b32 v40, s50, 8
+; GCN-NEXT:    v_writelane_b32 v40, s51, 9
+; GCN-NEXT:    v_writelane_b32 v40, s52, 10
+; GCN-NEXT:    v_writelane_b32 v40, s53, 11
+; GCN-NEXT:    v_writelane_b32 v40, s54, 12
+; GCN-NEXT:    v_writelane_b32 v40, s55, 13
+; GCN-NEXT:    v_writelane_b32 v40, s30, 14
+; GCN-NEXT:    v_writelane_b32 v40, s31, 15
 ; GCN-NEXT:    s_mov_b64 s[6:7], exec
 ; GCN-NEXT:  .LBB6_1: ; =>This Inner Loop Header: Depth=1
 ; GCN-NEXT:    v_readfirstlane_b32 s8, v0
@@ -882,22 +882,22 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) {
 ; GCN-NEXT:    s_cbranch_execnz .LBB6_1
 ; GCN-NEXT:  ; %bb.2:
 ; GCN-NEXT:    s_mov_b64 exec, s[6:7]
-; GCN-NEXT:    v_readlane_b32 s55, v40, 15
-; GCN-NEXT:    v_readlane_b32 s54, v40, 14
-; GCN-NEXT:    v_readlane_b32 s53, v40, 13
-; GCN-NEXT:    v_readlane_b32 s52, v40, 12
-; GCN-NEXT:    v_readlane_b32 s51, v40, 11
-; GCN-NEXT:    v_readlane_b32 s50, v40, 10
-; GCN-NEXT:    v_readlane_b32 s49, v40, 9
-; GCN-NEXT:    v_readlane_b32 s48, v40, 8
-; GCN-NEXT:    v_readlane_b32 s39, v40, 7
-; GCN-NEXT:    v_readlane_b32 s38, v40, 6
-; GCN-NEXT:    v_readlane_b32 s37, v40, 5
-; GCN-NEXT:    v_readlane_b32 s36, v40, 4
-; GCN-NEXT:    v_readlane_b32 s35, v40, 3
-; GCN-NEXT:    v_readlane_b32 s34, v40, 2
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
-; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s30, v40, 14
+; GCN-NEXT:    v_readlane_b32 s31, v40, 15
+; GCN-NEXT:    v_readlane_b32 s55, v40, 13
+; GCN-NEXT:    v_readlane_b32 s54, v40, 12
+; GCN-NEXT:    v_readlane_b32 s53, v40, 11
+; GCN-NEXT:    v_readlane_b32 s52, v40, 10
+; GCN-NEXT:    v_readlane_b32 s51, v40, 9
+; GCN-NEXT:    v_readlane_b32 s50, v40, 8
+; GCN-NEXT:    v_readlane_b32 s49, v40, 7
+; GCN-NEXT:    v_readlane_b32 s48, v40, 6
+; GCN-NEXT:    v_readlane_b32 s39, v40, 5
+; GCN-NEXT:    v_readlane_b32 s38, v40, 4
+; GCN-NEXT:    v_readlane_b32 s37, v40, 3
+; GCN-NEXT:    v_readlane_b32 s36, v40, 2
+; GCN-NEXT:    v_readlane_b32 s35, v40, 1
+; GCN-NEXT:    v_readlane_b32 s34, v40, 0
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -915,22 +915,22 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) {
 ; GISEL-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GISEL-NEXT:    s_mov_b64 exec, s[6:7]
 ; GISEL-NEXT:    s_addk_i32 s32, 0x400
-; GISEL-NEXT:    v_writelane_b32 v40, s30, 0
-; GISEL-NEXT:    v_writelane_b32 v40, s31, 1
-; GISEL-NEXT:    v_writelane_b32 v40, s34, 2
-; GISEL-NEXT:    v_writelane_b32 v40, s35, 3
-; GISEL-NEXT:    v_writelane_b32 v40, s36, 4
-; GISEL-NEXT:    v_writelane_b32 v40, s37, 5
-; GISEL-NEXT:    v_writelane_b32 v40, s38, 6
-; GISEL-NEXT:    v_writelane_b32 v40, s39, 7
-; GISEL-NEXT:    v_writelane_b32 v40, s48, 8
-; GISEL-NEXT:    v_writelane_b32 v40, s49, 9
-; GISEL-NEXT:    v_writelane_b32 v40, s50, 10
-; GISEL-NEXT:    v_writelane_b32 v40, s51, 11
-; GISEL-NEXT:    v_writelane_b32 v40, s52, 12
-; GISEL-NEXT:    v_writelane_b32 v40, s53, 13
-; GISEL-NEXT:    v_writelane_b32 v40, s54, 14
-; GISEL-NEXT:    v_writelane_b32 v40, s55, 15
+; GISEL-NEXT:    v_writelane_b32 v40, s34, 0
+; GISEL-NEXT:    v_writelane_b32 v40, s35, 1
+; GISEL-NEXT:    v_writelane_b32 v40, s36, 2
+; GISEL-NEXT:    v_writelane_b32 v40, s37, 3
+; GISEL-NEXT:    v_writelane_b32 v40, s38, 4
+; GISEL-NEXT:    v_writelane_b32 v40, s39, 5
+; GISEL-NEXT:    v_writelane_b32 v40, s48, 6
+; GISEL-NEXT:    v_writelane_b32 v40, s49, 7
+; GISEL-NEXT:    v_writelane_b32 v40, s50, 8
+; GISEL-NEXT:    v_writelane_b32 v40, s51, 9
+; GISEL-NEXT:    v_writelane_b32 v40, s52, 10
+; GISEL-NEXT:    v_writelane_b32 v40, s53, 11
+; GISEL-NEXT:    v_writelane_b32 v40, s54, 12
+; GISEL-NEXT:    v_writelane_b32 v40, s55, 13
+; GISEL-NEXT:    v_writelane_b32 v40, s30, 14
+; GISEL-NEXT:    v_writelane_b32 v40, s31, 15
 ; GISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GISEL-NEXT:  .LBB6_1: ; =>This Inner Loop Header: Depth=1
 ; GISEL-NEXT:    v_readfirstlane_b32 s8, v0
@@ -944,22 +944,22 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) {
 ; GISEL-NEXT:    s_cbranch_execnz .LBB6_1
 ; GISEL-NEXT:  ; %bb.2:
 ; GISEL-NEXT:    s_mov_b64 exec, s[6:7]
-; GISEL-NEXT:    v_readlane_b32 s55, v40, 15
-; GISEL-NEXT:    v_readlane_b32 s54, v40, 14
-; GISEL-NEXT:    v_readlane_b32 s53, v40, 13
-; GISEL-NEXT:    v_readlane_b32 s52, v40, 12
-; GISEL-NEXT:    v_readlane_b32 s51, v40, 11
-; GISEL-NEXT:    v_readlane_b32 s50, v40, 10
-; GISEL-NEXT:    v_readlane_b32 s49, v40, 9
-; GISEL-NEXT:    v_readlane_b32 s48, v40, 8
-; GISEL-NEXT:    v_readlane_b32 s39, v40, 7
-; GISEL-NEXT:    v_readlane_b32 s38, v40, 6
-; GISEL-NEXT:    v_readlane_b32 s37, v40, 5
-; GISEL-NEXT:    v_readlane_b32 s36, v40, 4
-; GISEL-NEXT:    v_readlane_b32 s35, v40, 3
-; GISEL-NEXT:    v_readlane_b32 s34, v40, 2
-; GISEL-NEXT:    v_readlane_b32 s31, v40, 1
-; GISEL-NEXT:    v_readlane_b32 s30, v40, 0
+; GISEL-NEXT:    v_readlane_b32 s30, v40, 14
+; GISEL-NEXT:    v_readlane_b32 s31, v40, 15
+; GISEL-NEXT:    v_readlane_b32 s55, v40, 13
+; GISEL-NEXT:    v_readlane_b32 s54, v40, 12
+; GISEL-NEXT:    v_readlane_b32 s53, v40, 11
+; GISEL-NEXT:    v_readlane_b32 s52, v40, 10
+; GISEL-NEXT:    v_readlane_b32 s51, v40, 9
+; GISEL-NEXT:    v_readlane_b32 s50, v40, 8
+; GISEL-NEXT:    v_readlane_b32 s49, v40, 7
+; GISEL-NEXT:    v_readlane_b32 s48, v40, 6
+; GISEL-NEXT:    v_readlane_b32 s39, v40, 5
+; GISEL-NEXT:    v_readlane_b32 s38, v40, 4
+; GISEL-NEXT:    v_readlane_b32 s37, v40, 3
+; GISEL-NEXT:    v_readlane_b32 s36, v40, 2
+; GISEL-NEXT:    v_readlane_b32 s35, v40, 1
+; GISEL-NEXT:    v_readlane_b32 s34, v40, 0
 ; GISEL-NEXT:    s_mov_b32 s32, s33
 ; GISEL-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GISEL-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -982,22 +982,22 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, ptr %fptr) {
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_addk_i32 s32, 0x400
 ; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-NEXT:    v_writelane_b32 v41, s30, 0
-; GCN-NEXT:    v_writelane_b32 v41, s31, 1
-; GCN-NEXT:    v_writelane_b32 v41, s34, 2
-; GCN-NEXT:    v_writelane_b32 v41, s35, 3
-; GCN-NEXT:    v_writelane_b32 v41, s36, 4
-; GCN-NEXT:    v_writelane_b32 v41, s37, 5
-; GCN-NEXT:    v_writelane_b32 v41, s38, 6
-; GCN-NEXT:    v_writelane_b32 v41, s39, 7
-; GCN-NEXT:    v_writelane_b32 v41, s48, 8
-; GCN-NEXT:    v_writelane_b32 v41, s49, 9
-; GCN-NEXT:    v_writelane_b32 v41, s50, 10
-; GCN-NEXT:    v_writelane_b32 v41, s51, 11
-; GCN-NEXT:    v_writelane_b32 v41, s52, 12
-; GCN-NEXT:    v_writelane_b32 v41, s53, 13
-; GCN-NEXT:    v_writelane_b32 v41, s54, 14
-; GCN-NEXT:    v_writelane_b32 v41, s55, 15
+; GCN-NEXT:    v_writelane_b32 v41, s34, 0
+; GCN-NEXT:    v_writelane_b32 v41, s35, 1
+; GCN-NEXT:    v_writelane_b32 v41, s36, 2
+; GCN-NEXT:    v_writelane_b32 v41, s37, 3
+; GCN-NEXT:    v_writelane_b32 v41, s38, 4
+; GCN-NEXT:    v_writelane_b32 v41, s39, 5
+; GCN-NEXT:    v_writelane_b32 v41, s48, 6
+; GCN-NEXT:    v_writelane_b32 v41, s49, 7
+; GCN-NEXT:    v_writelane_b32 v41, s50, 8
+; GCN-NEXT:    v_writelane_b32 v41, s51, 9
+; GCN-NEXT:    v_writelane_b32 v41, s52, 10
+; GCN-NEXT:    v_writelane_b32 v41, s53, 11
+; GCN-NEXT:    v_writelane_b32 v41, s54, 12
+; GCN-NEXT:    v_writelane_b32 v41, s55, 13
+; GCN-NEXT:    v_writelane_b32 v41, s30, 14
+; GCN-NEXT:    v_writelane_b32 v41, s31, 15
 ; GCN-NEXT:    v_mov_b32_e32 v40, v0
 ; GCN-NEXT:    s_mov_b64 s[4:5], exec
 ; GCN-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
@@ -1013,22 +1013,22 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, ptr %fptr) {
 ; GCN-NEXT:  ; %bb.2:
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    v_mov_b32_e32 v0, v40
-; GCN-NEXT:    v_readlane_b32 s55, v41, 15
-; GCN-NEXT:    v_readlane_b32 s54, v41, 14
-; GCN-NEXT:    v_readlane_b32 s53, v41, 13
-; GCN-NEXT:    v_readlane_b32 s52, v41, 12
-; GCN-NEXT:    v_readlane_b32 s51, v41, 11
-; GCN-NEXT:    v_readlane_b32 s50, v41, 10
-; GCN-NEXT:    v_readlane_b32 s49, v41, 9
-; GCN-NEXT:    v_readlane_b32 s48, v41, 8
-; GCN-NEXT:    v_readlane_b32 s39, v41, 7
-; GCN-NEXT:    v_readlane_b32 s38, v41, 6
-; GCN-NEXT:    v_readlane_b32 s37, v41, 5
-; GCN-NEXT:    v_readlane_b32 s36, v41, 4
-; GCN-NEXT:    v_readlane_b32 s35, v41, 3
-; GCN-NEXT:    v_readlane_b32 s34, v41, 2
-; GCN-NEXT:    v_readlane_b32 s31, v41, 1
-; GCN-NEXT:    v_readlane_b32 s30, v41, 0
+; GCN-NEXT:    v_readlane_b32 s30, v41, 14
+; GCN-NEXT:    v_readlane_b32 s31, v41, 15
+; GCN-NEXT:    v_readlane_b32 s55, v41, 13
+; GCN-NEXT:    v_readlane_b32 s54, v41, 12
+; GCN-NEXT:    v_readlane_b32 s53, v41, 11
+; GCN-NEXT:    v_readlane_b32 s52, v41, 10
+; GCN-NEXT:    v_readlane_b32 s51, v41, 9
+; GCN-NEXT:    v_readlane_b32 s50, v41, 8
+; GCN-NEXT:    v_readlane_b32 s49, v41, 7
+; GCN-NEXT:    v_readlane_b32 s48, v41, 6
+; GCN-NEXT:    v_readlane_b32 s39, v41, 5
+; GCN-NEXT:    v_readlane_b32 s38, v41, 4
+; GCN-NEXT:    v_readlane_b32 s37, v41, 3
+; GCN-NEXT:    v_readlane_b32 s36, v41, 2
+; GCN-NEXT:    v_readlane_b32 s35, v41, 1
+; GCN-NEXT:    v_readlane_b32 s34, v41, 0
 ; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
@@ -1048,22 +1048,22 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, ptr %fptr) {
 ; GISEL-NEXT:    s_mov_b64 exec, s[4:5]
 ; GISEL-NEXT:    s_addk_i32 s32, 0x400
 ; GISEL-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
-; GISEL-NEXT:    v_writelane_b32 v41, s30, 0
-; GISEL-NEXT:    v_writelane_b32 v41, s31, 1
-; GISEL-NEXT:    v_writelane_b32 v41, s34, 2
-; GISEL-NEXT:    v_writelane_b32 v41, s35, 3
-; GISEL-NEXT:    v_writelane_b32 v41, s36, 4
-; GISEL-NEXT:    v_writelane_b32 v41, s37, 5
-; GISEL-NEXT:    v_writelane_b32 v41, s38, 6
-; GISEL-NEXT:    v_writelane_b32 v41, s39, 7
-; GISEL-NEXT:    v_writelane_b32 v41, s48, 8
-; GISEL-NEXT:    v_writelane_b32 v41, s49, 9
-; GISEL-NEXT:    v_writelane_b32 v41, s50, 10
-; GISEL-NEXT:    v_writelane_b32 v41, s51, 11
-; GISEL-NEXT:    v_writelane_b32 v41, s52, 12
-; GISEL-NEXT:    v_writelane_b32 v41, s53, 13
-; GISEL-NEXT:    v_writelane_b32 v41, s54, 14
-; GISEL-NEXT:    v_writelane_b32 v41, s55, 15
+; GISEL-NEXT:    v_writelane_b32 v41, s34, 0
+; GISEL-NEXT:    v_writelane_b32 v41, s35, 1
+; GISEL-NEXT:    v_writelane_b32 v41, s36, 2
+; GISEL-NEXT:    v_writelane_b32 v41, s37, 3
+; GISEL-NEXT:    v_writelane_b32 v41, s38, 4
+; GISEL-NEXT:    v_writelane_b32 v41, s39, 5
+; GISEL-NEXT:    v_writelane_b32 v41, s48, 6
+; GISEL-NEXT:    v_writelane_b32 v41, s49, 7
+; GISEL-NEXT:    v_writelane_b32 v41, s50, 8
+; GISEL-NEXT:    v_writelane_b32 v41, s51, 9
+; GISEL-NEXT:    v_writelane_b32 v41, s52, 10
+; GISEL-NEXT:    v_writelane_b32 v41, s53, 11
+; GISEL-NEXT:    v_writelane_b32 v41, s54, 12
+; GISEL-NEXT:    v_writelane_b32 v41, s55, 13
+; GISEL-NEXT:    v_writelane_b32 v41, s30, 14
+; GISEL-NEXT:    v_writelane_b32 v41, s31, 15
 ; GISEL-NEXT:    v_mov_b32_e32 v40, v0
 ; GISEL-NEXT:    s_mov_b64 s[4:5], exec
 ; GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
@@ -1079,22 +1079,22 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, ptr %fptr) {
 ; GISEL-NEXT:  ; %bb.2:
 ; GISEL-NEXT:    s_mov_b64 exec, s[4:5]
 ; GISEL-NEXT:    v_mov_b32_e32 v0, v40
-; GISEL-NEXT:    v_readlane_b32 s55, v41, 15
-; GISEL-NEXT:    v_readlane_b32 s54, v41, 14
-; GISEL-NEXT:    v_readlane_b32 s53, v41, 13
-; GISEL-NEXT:    v_readlane_b32 s52, v41, 12
-; GISEL-NEXT:    v_readlane_b32 s51, v41, 11
-; GISEL-NEXT:    v_readlane_b32 s50, v41, 10
-; GISEL-NEXT:    v_readlane_b32 s49, v41, 9
-; GISEL-NEXT:    v_readlane_b32 s48, v41, 8
-; GISEL-NEXT:    v_readlane_b32 s39, v41, 7
-; GISEL-NEXT:    v_readlane_b32 s38, v41, 6
-; GISEL-NEXT:    v_readlane_b32 s37, v41, 5
-; GISEL-NEXT:    v_readlane_b32 s36, v41, 4
-; GISEL-NEXT:    v_readlane_b32 s35, v41, 3
-; GISEL-NEXT:    v_readlane_b32 s34, v41, 2
-; GISEL-NEXT:    v_readlane_b32 s31, v41, 1
-; GISEL-NEXT:    v_readlane_b32 s30, v41, 0
+; GISEL-NEXT:    v_readlane_b32 s30, v41, 14
+; GISEL-NEXT:    v_readlane_b32 s31, v41, 15
+; GISEL-NEXT:    v_readlane_b32 s55, v41, 13
+; GISEL-NEXT:    v_readlane_b32 s54, v41, 12
+; GISEL-NEXT:    v_readlane_b32 s53, v41, 11
+; GISEL-NEXT:    v_readlane_b32 s52, v41, 10
+; GISEL-NEXT:    v_readlane_b32 s51, v41, 9
+; GISEL-NEXT:    v_readlane_b32 s50, v41, 8
+; GISEL-NEXT:    v_readlane_b32 s49, v41, 7
+; GISEL-NEXT:    v_readlane_b32 s48, v41, 6
+; GISEL-NEXT:    v_readlane_b32 s39, v41, 5
+; GISEL-NEXT:    v_readlane_b32 s38, v41, 4
+; GISEL-NEXT:    v_readlane_b32 s37, v41, 3
+; GISEL-NEXT:    v_readlane_b32 s36, v41, 2
+; GISEL-NEXT:    v_readlane_b32 s35, v41, 1
+; GISEL-NEXT:    v_readlane_b32 s34, v41, 0
 ; GISEL-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GISEL-NEXT:    s_mov_b32 s32, s33
 ; GISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
@@ -1121,22 +1121,22 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) {
 ; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_addk_i32 s32, 0x400
-; GCN-NEXT:    v_writelane_b32 v40, s30, 0
-; GCN-NEXT:    v_writelane_b32 v40, s31, 1
-; GCN-NEXT:    v_writelane_b32 v40, s34, 2
-; GCN-NEXT:    v_writelane_b32 v40, s35, 3
-; GCN-NEXT:    v_writelane_b32 v40, s36, 4
-; GCN-NEXT:    v_writelane_b32 v40, s37, 5
-; GCN-NEXT:    v_writelane_b32 v40, s38, 6
-; GCN-NEXT:    v_writelane_b32 v40, s39, 7
-; GCN-NEXT:    v_writelane_b32 v40, s48, 8
-; GCN-NEXT:    v_writelane_b32 v40, s49, 9
-; GCN-NEXT:    v_writelane_b32 v40, s50, 10
-; GCN-NEXT:    v_writelane_b32 v40, s51, 11
-; GCN-NEXT:    v_writelane_b32 v40, s52, 12
-; GCN-NEXT:    v_writelane_b32 v40, s53, 13
-; GCN-NEXT:    v_writelane_b32 v40, s54, 14
-; GCN-NEXT:    v_writelane_b32 v40, s55, 15
+; GCN-NEXT:    v_writelane_b32 v40, s34, 0
+; GCN-NEXT:    v_writelane_b32 v40, s35, 1
+; GCN-NEXT:    v_writelane_b32 v40, s36, 2
+; GCN-NEXT:    v_writelane_b32 v40, s37, 3
+; GCN-NEXT:    v_writelane_b32 v40, s38, 4
+; GCN-NEXT:    v_writelane_b32 v40, s39, 5
+; GCN-NEXT:    v_writelane_b32 v40, s48, 6
+; GCN-NEXT:    v_writelane_b32 v40, s49, 7
+; GCN-NEXT:    v_writelane_b32 v40, s50, 8
+; GCN-NEXT:    v_writelane_b32 v40, s51, 9
+; GCN-NEXT:    v_writelane_b32 v40, s52, 10
+; GCN-NEXT:    v_writelane_b32 v40, s53, 11
+; GCN-NEXT:    v_writelane_b32 v40, s54, 12
+; GCN-NEXT:    v_writelane_b32 v40, s55, 13
+; GCN-NEXT:    v_writelane_b32 v40, s30, 14
+; GCN-NEXT:    v_writelane_b32 v40, s31, 15
 ; GCN-NEXT:    s_mov_b64 s[4:5], exec
 ; GCN-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
 ; GCN-NEXT:    v_readfirstlane_b32 s8, v1
@@ -1152,22 +1152,22 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) {
 ; GCN-NEXT:  ; %bb.2:
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    v_mov_b32_e32 v0, v3
-; GCN-NEXT:    v_readlane_b32 s55, v40, 15
-; GCN-NEXT:    v_readlane_b32 s54, v40, 14
-; GCN-NEXT:    v_readlane_b32 s53, v40, 13
-; GCN-NEXT:    v_readlane_b32 s52, v40, 12
-; GCN-NEXT:    v_readlane_b32 s51, v40, 11
-; GCN-NEXT:    v_readlane_b32 s50, v40, 10
-; GCN-NEXT:    v_readlane_b32 s49, v40, 9
-; GCN-NEXT:    v_readlane_b32 s48, v40, 8
-; GCN-NEXT:    v_readlane_b32 s39, v40, 7
-; GCN-NEXT:    v_readlane_b32 s38, v40, 6
-; GCN-NEXT:    v_readlane_b32 s37, v40, 5
-; GCN-NEXT:    v_readlane_b32 s36, v40, 4
-; GCN-NEXT:    v_readlane_b32 s35, v40, 3
-; GCN-NEXT:    v_readlane_b32 s34, v40, 2
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
-; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s30, v40, 14
+; GCN-NEXT:    v_readlane_b32 s31, v40, 15
+; GCN-NEXT:    v_readlane_b32 s55, v40, 13
+; GCN-NEXT:    v_readlane_b32 s54, v40, 12
+; GCN-NEXT:    v_readlane_b32 s53, v40, 11
+; GCN-NEXT:    v_readlane_b32 s52, v40, 10
+; GCN-NEXT:    v_readlane_b32 s51, v40, 9
+; GCN-NEXT:    v_readlane_b32 s50, v40, 8
+; GCN-NEXT:    v_readlane_b32 s49, v40, 7
+; GCN-NEXT:    v_readlane_b32 s48, v40, 6
+; GCN-NEXT:    v_readlane_b32 s39, v40, 5
+; GCN-NEXT:    v_readlane_b32 s38, v40, 4
+; GCN-NEXT:    v_readlane_b32 s37, v40, 3
+; GCN-NEXT:    v_readlane_b32 s36, v40, 2
+; GCN-NEXT:    v_readlane_b32 s35, v40, 1
+; GCN-NEXT:    v_readlane_b32 s34, v40, 0
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -1185,22 +1185,22 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) {
 ; GISEL-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GISEL-NEXT:    s_mov_b64 exec, s[4:5]
 ; GISEL-NEXT:    s_addk_i32 s32, 0x400
-; GISEL-NEXT:    v_writelane_b32 v40, s30, 0
-; GISEL-NEXT:    v_writelane_b32 v40, s31, 1
-; GISEL-NEXT:    v_writelane_b32 v40, s34, 2
-; GISEL-NEXT:    v_writelane_b32 v40, s35, 3
-; GISEL-NEXT:    v_writelane_b32 v40, s36, 4
-; GISEL-NEXT:    v_writelane_b32 v40, s37, 5
-; GISEL-NEXT:    v_writelane_b32 v40, s38, 6
-; GISEL-NEXT:    v_writelane_b32 v40, s39, 7
-; GISEL-NEXT:    v_writelane_b32 v40, s48, 8
-; GISEL-NEXT:    v_writelane_b32 v40, s49, 9
-; GISEL-NEXT:    v_writelane_b32 v40, s50, 10
-; GISEL-NEXT:    v_writelane_b32 v40, s51, 11
-; GISEL-NEXT:    v_writelane_b32 v40, s52, 12
-; GISEL-NEXT:    v_writelane_b32 v40, s53, 13
-; GISEL-NEXT:    v_writelane_b32 v40, s54, 14
-; GISEL-NEXT:    v_writelane_b32 v40, s55, 15
+; GISEL-NEXT:    v_writelane_b32 v40, s34, 0
+; GISEL-NEXT:    v_writelane_b32 v40, s35, 1
+; GISEL-NEXT:    v_writelane_b32 v40, s36, 2
+; GISEL-NEXT:    v_writelane_b32 v40, s37, 3
+; GISEL-NEXT:    v_writelane_b32 v40, s38, 4
+; GISEL-NEXT:    v_writelane_b32 v40, s39, 5
+; GISEL-NEXT:    v_writelane_b32 v40, s48, 6
+; GISEL-NEXT:    v_writelane_b32 v40, s49, 7
+; GISEL-NEXT:    v_writelane_b32 v40, s50, 8
+; GISEL-NEXT:    v_writelane_b32 v40, s51, 9
+; GISEL-NEXT:    v_writelane_b32 v40, s52, 10
+; GISEL-NEXT:    v_writelane_b32 v40, s53, 11
+; GISEL-NEXT:    v_writelane_b32 v40, s54, 12
+; GISEL-NEXT:    v_writelane_b32 v40, s55, 13
+; GISEL-NEXT:    v_writelane_b32 v40, s30, 14
+; GISEL-NEXT:    v_writelane_b32 v40, s31, 15
 ; GISEL-NEXT:    s_mov_b64 s[4:5], exec
 ; GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
 ; GISEL-NEXT:    v_readfirstlane_b32 s8, v1
@@ -1216,22 +1216,22 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) {
 ; GISEL-NEXT:  ; %bb.2:
 ; GISEL-NEXT:    s_mov_b64 exec, s[4:5]
 ; GISEL-NEXT:    v_mov_b32_e32 v0, v2
-; GISEL-NEXT:    v_readlane_b32 s55, v40, 15
-; GISEL-NEXT:    v_readlane_b32 s54, v40, 14
-; GISEL-NEXT:    v_readlane_b32 s53, v40, 13
-; GISEL-NEXT:    v_readlane_b32 s52, v40, 12
-; GISEL-NEXT:    v_readlane_b32 s51, v40, 11
-; GISEL-NEXT:    v_readlane_b32 s50, v40, 10
-; GISEL-NEXT:    v_readlane_b32 s49, v40, 9
-; GISEL-NEXT:    v_readlane_b32 s48, v40, 8
-; GISEL-NEXT:    v_readlane_b32 s39, v40, 7
-; GISEL-NEXT:    v_readlane_b32 s38, v40, 6
-; GISEL-NEXT:    v_readlane_b32 s37, v40, 5
-; GISEL-NEXT:    v_readlane_b32 s36, v40, 4
-; GISEL-NEXT:    v_readlane_b32 s35, v40, 3
-; GISEL-NEXT:    v_readlane_b32 s34, v40, 2
-; GISEL-NEXT:    v_readlane_b32 s31, v40, 1
-; GISEL-NEXT:    v_readlane_b32 s30, v40, 0
+; GISEL-NEXT:    v_readlane_b32 s30, v40, 14
+; GISEL-NEXT:    v_readlane_b32 s31, v40, 15
+; GISEL-NEXT:    v_readlane_b32 s55, v40, 13
+; GISEL-NEXT:    v_readlane_b32 s54, v40, 12
+; GISEL-NEXT:    v_readlane_b32 s53, v40, 11
+; GISEL-NEXT:    v_readlane_b32 s52, v40, 10
+; GISEL-NEXT:    v_readlane_b32 s51, v40, 9
+; GISEL-NEXT:    v_readlane_b32 s50, v40, 8
+; GISEL-NEXT:    v_readlane_b32 s49, v40, 7
+; GISEL-NEXT:    v_readlane_b32 s48, v40, 6
+; GISEL-NEXT:    v_readlane_b32 s39, v40, 5
+; GISEL-NEXT:    v_readlane_b32 s38, v40, 4
+; GISEL-NEXT:    v_readlane_b32 s37, v40, 3
+; GISEL-NEXT:    v_readlane_b32 s36, v40, 2
+; GISEL-NEXT:    v_readlane_b32 s35, v40, 1
+; GISEL-NEXT:    v_readlane_b32 s34, v40, 0
 ; GISEL-NEXT:    s_mov_b32 s32, s33
 ; GISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GISEL-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -1254,22 +1254,22 @@ define void @test_indirect_tail_call_vgpr_ptr(ptr %fptr) {
 ; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_addk_i32 s32, 0x400
-; GCN-NEXT:    v_writelane_b32 v40, s30, 0
-; GCN-NEXT:    v_writelane_b32 v40, s31, 1
-; GCN-NEXT:    v_writelane_b32 v40, s34, 2
-; GCN-NEXT:    v_writelane_b32 v40, s35, 3
-; GCN-NEXT:    v_writelane_b32 v40, s36, 4
-; GCN-NEXT:    v_writelane_b32 v40, s37, 5
-; GCN-NEXT:    v_writelane_b32 v40, s38, 6
-; GCN-NEXT:    v_writelane_b32 v40, s39, 7
-; GCN-NEXT:    v_writelane_b32 v40, s48, 8
-; GCN-NEXT:    v_writelane_b32 v40, s49, 9
-; GCN-NEXT:    v_writelane_b32 v40, s50, 10
-; GCN-NEXT:    v_writelane_b32 v40, s51, 11
-; GCN-NEXT:    v_writelane_b32 v40, s52, 12
-; GCN-NEXT:    v_writelane_b32 v40, s53, 13
-; GCN-NEXT:    v_writelane_b32 v40, s54, 14
-; GCN-NEXT:    v_writelane_b32 v40, s55, 15
+; GCN-NEXT:    v_writelane_b32 v40, s34, 0
+; GCN-NEXT:    v_writelane_b32 v40, s35, 1
+; GCN-NEXT:    v_writelane_b32 v40, s36, 2
+; GCN-NEXT:    v_writelane_b32 v40, s37, 3
+; GCN-NEXT:    v_writelane_b32 v40, s38, 4
+; GCN-NEXT:    v_writelane_b32 v40, s39, 5
+; GCN-NEXT:    v_writelane_b32 v40, s48, 6
+; GCN-NEXT:    v_writelane_b32 v40, s49, 7
+; GCN-NEXT:    v_writelane_b32 v40, s50, 8
+; GCN-NEXT:    v_writelane_b32 v40, s51, 9
+; GCN-NEXT:    v_writelane_b32 v40, s52, 10
+; GCN-NEXT:    v_writelane_b32 v40, s53, 11
+; GCN-NEXT:    v_writelane_b32 v40, s54, 12
+; GCN-NEXT:    v_writelane_b32 v40, s55, 13
+; GCN-NEXT:    v_writelane_b32 v40, s30, 14
+; GCN-NEXT:    v_writelane_b32 v40, s31, 15
 ; GCN-NEXT:    s_mov_b64 s[4:5], exec
 ; GCN-NEXT:  .LBB9_1: ; =>This Inner Loop Header: Depth=1
 ; GCN-NEXT:    v_readfirstlane_b32 s6, v0
@@ -1282,22 +1282,22 @@ define void @test_indirect_tail_call_vgpr_ptr(ptr %fptr) {
 ; GCN-NEXT:    s_cbranch_execnz .LBB9_1
 ; GCN-NEXT:  ; %bb.2:
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
-; GCN-NEXT:    v_readlane_b32 s55, v40, 15
-; GCN-NEXT:    v_readlane_b32 s54, v40, 14
-; GCN-NEXT:    v_readlane_b32 s53, v40, 13
-; GCN-NEXT:    v_readlane_b32 s52, v40, 12
-; GCN-NEXT:    v_readlane_b32 s51, v40, 11
-; GCN-NEXT:    v_readlane_b32 s50, v40, 10
-; GCN-NEXT:    v_readlane_b32 s49, v40, 9
-; GCN-NEXT:    v_readlane_b32 s48, v40, 8
-; GCN-NEXT:    v_readlane_b32 s39, v40, 7
-; GCN-NEXT:    v_readlane_b32 s38, v40, 6
-; GCN-NEXT:    v_readlane_b32 s37, v40, 5
-; GCN-NEXT:    v_readlane_b32 s36, v40, 4
-; GCN-NEXT:    v_readlane_b32 s35, v40, 3
-; GCN-NEXT:    v_readlane_b32 s34, v40, 2
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
-; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s30, v40, 14
+; GCN-NEXT:    v_readlane_b32 s31, v40, 15
+; GCN-NEXT:    v_readlane_b32 s55, v40, 13
+; GCN-NEXT:    v_readlane_b32 s54, v40, 12
+; GCN-NEXT:    v_readlane_b32 s53, v40, 11
+; GCN-NEXT:    v_readlane_b32 s52, v40, 10
+; GCN-NEXT:    v_readlane_b32 s51, v40, 9
+; GCN-NEXT:    v_readlane_b32 s50, v40, 8
+; GCN-NEXT:    v_readlane_b32 s49, v40, 7
+; GCN-NEXT:    v_readlane_b32 s48, v40, 6
+; GCN-NEXT:    v_readlane_b32 s39, v40, 5
+; GCN-NEXT:    v_readlane_b32 s38, v40, 4
+; GCN-NEXT:    v_readlane_b32 s37, v40, 3
+; GCN-NEXT:    v_readlane_b32 s36, v40, 2
+; GCN-NEXT:    v_readlane_b32 s35, v40, 1
+; GCN-NEXT:    v_readlane_b32 s34, v40, 0
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -1315,22 +1315,22 @@ define void @test_indirect_tail_call_vgpr_ptr(ptr %fptr) {
 ; GISEL-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GISEL-NEXT:    s_mov_b64 exec, s[4:5]
 ; GISEL-NEXT:    s_addk_i32 s32, 0x400
-; GISEL-NEXT:    v_writelane_b32 v40, s30, 0
-; GISEL-NEXT:    v_writelane_b32 v40, s31, 1
-; GISEL-NEXT:    v_writelane_b32 v40, s34, 2
-; GISEL-NEXT:    v_writelane_b32 v40, s35, 3
-; GISEL-NEXT:    v_writelane_b32 v40, s36, 4
-; GISEL-NEXT:    v_writelane_b32 v40, s37, 5
-; GISEL-NEXT:    v_writelane_b32 v40, s38, 6
-; GISEL-NEXT:    v_writelane_b32 v40, s39, 7
-; GISEL-NEXT:    v_writelane_b32 v40, s48, 8
-; GISEL-NEXT:    v_writelane_b32 v40, s49, 9
-; GISEL-NEXT:    v_writelane_b32 v40, s50, 10
-; GISEL-NEXT:    v_writelane_b32 v40, s51, 11
-; GISEL-NEXT:    v_writelane_b32 v40, s52, 12
-; GISEL-NEXT:    v_writelane_b32 v40, s53, 13
-; GISEL-NEXT:    v_writelane_b32 v40, s54, 14
-; GISEL-NEXT:    v_writelane_b32 v40, s55, 15
+; GISEL-NEXT:    v_writelane_b32 v40, s34, 0
+; GISEL-NEXT:    v_writelane_b32 v40, s35, 1
+; GISEL-NEXT:    v_writelane_b32 v40, s36, 2
+; GISEL-NEXT:    v_writelane_b32 v40, s37, 3
+; GISEL-NEXT:    v_writelane_b32 v40, s38, 4
+; GISEL-NEXT:    v_writelane_b32 v40, s39, 5
+; GISEL-NEXT:    v_writelane_b32 v40, s48, 6
+; GISEL-NEXT:    v_writelane_b32 v40, s49, 7
+; GISEL-NEXT:    v_writelane_b32 v40, s50, 8
+; GISEL-NEXT:    v_writelane_b32 v40, s51, 9
+; GISEL-NEXT:    v_writelane_b32 v40, s52, 10
+; GISEL-NEXT:    v_writelane_b32 v40, s53, 11
+; GISEL-NEXT:    v_writelane_b32 v40, s54, 12
+; GISEL-NEXT:    v_writelane_b32 v40, s55, 13
+; GISEL-NEXT:    v_writelane_b32 v40, s30, 14
+; GISEL-NEXT:    v_writelane_b32 v40, s31, 15
 ; GISEL-NEXT:    s_mov_b64 s[4:5], exec
 ; GISEL-NEXT:  .LBB9_1: ; =>This Inner Loop Header: Depth=1
 ; GISEL-NEXT:    v_readfirstlane_b32 s6, v0
@@ -1343,22 +1343,22 @@ define void @test_indirect_tail_call_vgpr_ptr(ptr %fptr) {
 ; GISEL-NEXT:    s_cbranch_execnz .LBB9_1
 ; GISEL-NEXT:  ; %bb.2:
 ; GISEL-NEXT:    s_mov_b64 exec, s[4:5]
-; GISEL-NEXT:    v_readlane_b32 s55, v40, 15
-; GISEL-NEXT:    v_readlane_b32 s54, v40, 14
-; GISEL-NEXT:    v_readlane_b32 s53, v40, 13
-; GISEL-NEXT:    v_readlane_b32 s52, v40, 12
-; GISEL-NEXT:    v_readlane_b32 s51, v40, 11
-; GISEL-NEXT:    v_readlane_b32 s50, v40, 10
-; GISEL-NEXT:    v_readlane_b32 s49, v40, 9
-; GISEL-NEXT:    v_readlane_b32 s48, v40, 8
-; GISEL-NEXT:    v_readlane_b32 s39, v40, 7
-; GISEL-NEXT:    v_readlane_b32 s38, v40, 6
-; GISEL-NEXT:    v_readlane_b32 s37, v40, 5
-; GISEL-NEXT:    v_readlane_b32 s36, v40, 4
-; GISEL-NEXT:    v_readlane_b32 s35, v40, 3
-; GISEL-NEXT:    v_readlane_b32 s34, v40, 2
-; GISEL-NEXT:    v_readlane_b32 s31, v40, 1
-; GISEL-NEXT:    v_readlane_b32 s30, v40, 0
+; GISEL-NEXT:    v_readlane_b32 s30, v40, 14
+; GISEL-NEXT:    v_readlane_b32 s31, v40, 15
+; GISEL-NEXT:    v_readlane_b32 s55, v40, 13
+; GISEL-NEXT:    v_readlane_b32 s54, v40, 12
+; GISEL-NEXT:    v_readlane_b32 s53, v40, 11
+; GISEL-NEXT:    v_readlane_b32 s52, v40, 10
+; GISEL-NEXT:    v_readlane_b32 s51, v40, 9
+; GISEL-NEXT:    v_readlane_b32 s50, v40, 8
+; GISEL-NEXT:    v_readlane_b32 s49, v40, 7
+; GISEL-NEXT:    v_readlane_b32 s48, v40, 6
+; GISEL-NEXT:    v_readlane_b32 s39, v40, 5
+; GISEL-NEXT:    v_readlane_b32 s38, v40, 4
+; GISEL-NEXT:    v_readlane_b32 s37, v40, 3
+; GISEL-NEXT:    v_readlane_b32 s36, v40, 2
+; GISEL-NEXT:    v_readlane_b32 s35, v40, 1
+; GISEL-NEXT:    v_readlane_b32 s34, v40, 0
 ; GISEL-NEXT:    s_mov_b32 s32, s33
 ; GISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GISEL-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
index 7d8a0b70d5f55..b5b435ba36fb3 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
@@ -38,8 +38,8 @@ define void @f0() {
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v4, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v4, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v4, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_load_b32 v4, off, s33 ; 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll
index 358a3fe65ad0f..1d83ec582451a 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll
@@ -7,13 +7,13 @@ define fastcc i32 @foo() #0 {
   ; CHECK-LABEL: name: foo
   ; CHECK: bb.0 (%ir-block.0):
   ; CHECK-NEXT:   successors: %bb.1(0x80000000)
-  ; CHECK-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr30, $sgpr31, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11
+  ; CHECK-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   S_WAITCNT 0
   ; CHECK-NEXT:   $sgpr16 = S_MOV_B32 $sgpr33
   ; CHECK-NEXT:   $sgpr33 = S_MOV_B32 $sgpr32
   ; CHECK-NEXT:   $sgpr17 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
-  ; CHECK-NEXT:   BUFFER_STORE_DWORD_OFFSET $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.2, addrspace 5)
+  ; CHECK-NEXT:   BUFFER_STORE_DWORD_OFFSET $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.1, addrspace 5)
   ; CHECK-NEXT:   $exec_lo = S_MOV_B32 killed $sgpr17
   ; CHECK-NEXT:   $sgpr32 = frame-setup S_ADDK_I32 $sgpr32, 512, implicit-def dead $scc
   ; CHECK-NEXT:   $vgpr40 = V_WRITELANE_B32 killed $sgpr16, 2, undef $vgpr40
@@ -26,8 +26,8 @@ define fastcc i32 @foo() #0 {
   ; CHECK-NEXT:   BUFFER_GL1_INV implicit $exec
   ; CHECK-NEXT:   BUFFER_GL0_INV implicit $exec
   ; CHECK-NEXT:   renamable $sgpr16_sgpr17 = S_LOAD_DWORDX2_IMM killed renamable $sgpr16_sgpr17, 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4)
-  ; CHECK-NEXT:   $vgpr40 = V_WRITELANE_B32 killed $sgpr30, 0, $vgpr40
-  ; CHECK-NEXT:   $vgpr40 = V_WRITELANE_B32 killed $sgpr31, 1, $vgpr40
+  ; CHECK-NEXT:   $vgpr40 = V_WRITELANE_B32 killed $sgpr30, 0, $vgpr40, implicit-def $sgpr30_sgpr31, implicit $sgpr30_sgpr31
+  ; CHECK-NEXT:   $vgpr40 = V_WRITELANE_B32 killed $sgpr31, 1, $vgpr40, implicit $sgpr30_sgpr31
   ; CHECK-NEXT:   S_WAITCNT 49279
   ; CHECK-NEXT:   dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr16_sgpr17, @bar, csr_amdgpu, implicit killed $sgpr4_sgpr5, implicit killed $sgpr6_sgpr7, implicit killed $sgpr8_sgpr9, implicit killed $sgpr10_sgpr11, implicit killed $sgpr12, implicit killed $sgpr13, implicit killed $sgpr14, implicit killed $sgpr15, implicit killed $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   $vcc_lo = S_MOV_B32 $exec_lo
@@ -39,12 +39,12 @@ define fastcc i32 @foo() #0 {
   ; CHECK-NEXT:   S_CBRANCH_VCCNZ %bb.1, implicit $vcc_lo
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2.DummyReturnBlock:
+  ; CHECK-NEXT:   $sgpr30 = V_READLANE_B32 $vgpr40, 0, implicit-def $sgpr30_sgpr31
   ; CHECK-NEXT:   $sgpr31 = V_READLANE_B32 $vgpr40, 1
-  ; CHECK-NEXT:   $sgpr30 = V_READLANE_B32 $vgpr40, 0
   ; CHECK-NEXT:   $sgpr32 = S_MOV_B32 $sgpr33
   ; CHECK-NEXT:   $sgpr4 = V_READLANE_B32 $vgpr40, 2
   ; CHECK-NEXT:   $sgpr5 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
-  ; CHECK-NEXT:   $vgpr40 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, implicit $exec :: ("amdgpu-thread-private" load (s32) from %stack.2, addrspace 5)
+  ; CHECK-NEXT:   $vgpr40 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, implicit $exec :: ("amdgpu-thread-private" load (s32) from %stack.1, addrspace 5)
   ; CHECK-NEXT:   $exec_lo = S_MOV_B32 killed $sgpr5
   ; CHECK-NEXT:   $sgpr33 = S_MOV_B32 killed $sgpr4
   ; CHECK-NEXT:   S_WAITCNT 16240
diff --git a/llvm/test/CodeGen/AMDGPU/issue176578.ll b/llvm/test/CodeGen/AMDGPU/issue176578.ll
index 08986d1f61efd..22c1307c779ee 100644
--- a/llvm/test/CodeGen/AMDGPU/issue176578.ll
+++ b/llvm/test/CodeGen/AMDGPU/issue176578.ll
@@ -18,22 +18,21 @@ define <4 x i8> @issue176578() #0 {
 ; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; CHECK-NEXT:    s_mov_b64 exec, s[18:19]
 ; CHECK-NEXT:    v_writelane_b32 v41, s16, 15
-; CHECK-NEXT:    v_writelane_b32 v41, s30, 0
-; CHECK-NEXT:    v_writelane_b32 v41, s31, 1
-; CHECK-NEXT:    v_writelane_b32 v41, s34, 2
-; CHECK-NEXT:    v_writelane_b32 v41, s35, 3
-; CHECK-NEXT:    v_writelane_b32 v41, s36, 4
-; CHECK-NEXT:    v_writelane_b32 v41, s37, 5
-; CHECK-NEXT:    v_writelane_b32 v41, s38, 6
-; CHECK-NEXT:    v_writelane_b32 v41, s39, 7
-; CHECK-NEXT:    v_writelane_b32 v41, s48, 8
-; CHECK-NEXT:    v_writelane_b32 v41, s49, 9
-; CHECK-NEXT:    v_writelane_b32 v41, s50, 10
-; CHECK-NEXT:    v_writelane_b32 v41, s51, 11
-; CHECK-NEXT:    v_writelane_b32 v41, s52, 12
-; CHECK-NEXT:    v_writelane_b32 v41, s53, 13
+; CHECK-NEXT:    v_writelane_b32 v41, s34, 0
+; CHECK-NEXT:    v_writelane_b32 v41, s35, 1
+; CHECK-NEXT:    v_writelane_b32 v41, s36, 2
+; CHECK-NEXT:    v_writelane_b32 v41, s37, 3
+; CHECK-NEXT:    v_writelane_b32 v41, s38, 4
+; CHECK-NEXT:    v_writelane_b32 v41, s39, 5
+; CHECK-NEXT:    v_writelane_b32 v41, s48, 6
+; CHECK-NEXT:    v_writelane_b32 v41, s49, 7
+; CHECK-NEXT:    v_writelane_b32 v41, s50, 8
+; CHECK-NEXT:    v_writelane_b32 v41, s51, 9
+; CHECK-NEXT:    v_writelane_b32 v41, s52, 10
+; CHECK-NEXT:    v_writelane_b32 v41, s53, 11
+; CHECK-NEXT:    v_writelane_b32 v41, s54, 12
 ; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
-; CHECK-NEXT:    v_writelane_b32 v41, s54, 14
+; CHECK-NEXT:    v_writelane_b32 v41, s30, 13
 ; CHECK-NEXT:    v_mov_b32_e32 v40, v31
 ; CHECK-NEXT:    s_mov_b32 s50, s15
 ; CHECK-NEXT:    s_mov_b32 s51, s14
@@ -45,6 +44,7 @@ define <4 x i8> @issue176578() #0 {
 ; CHECK-NEXT:    s_mov_b64 s[48:49], s[4:5]
 ; CHECK-NEXT:    s_mov_b32 s54, 0
 ; CHECK-NEXT:    s_addk_i32 s32, 0x400
+; CHECK-NEXT:    v_writelane_b32 v41, s31, 14
 ; CHECK-NEXT:    s_branch .LBB0_2
 ; CHECK-NEXT:  .LBB0_1: ; %Flow
 ; CHECK-NEXT:    ; in Loop: Header=BB0_2 Depth=1
@@ -81,25 +81,25 @@ define <4 x i8> @issue176578() #0 {
 ; CHECK-NEXT:    s_branch .LBB0_1
 ; CHECK-NEXT:  .LBB0_4: ; %bb4
 ; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; CHECK-NEXT:    v_readlane_b32 s30, v41, 13
 ; CHECK-NEXT:    v_mov_b32_e32 v0, s54
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v3, 0
-; CHECK-NEXT:    v_readlane_b32 s54, v41, 14
-; CHECK-NEXT:    v_readlane_b32 s53, v41, 13
-; CHECK-NEXT:    v_readlane_b32 s52, v41, 12
-; CHECK-NEXT:    v_readlane_b32 s51, v41, 11
-; CHECK-NEXT:    v_readlane_b32 s50, v41, 10
-; CHECK-NEXT:    v_readlane_b32 s49, v41, 9
-; CHECK-NEXT:    v_readlane_b32 s48, v41, 8
-; CHECK-NEXT:    v_readlane_b32 s39, v41, 7
-; CHECK-NEXT:    v_readlane_b32 s38, v41, 6
-; CHECK-NEXT:    v_readlane_b32 s37, v41, 5
-; CHECK-NEXT:    v_readlane_b32 s36, v41, 4
-; CHECK-NEXT:    v_readlane_b32 s35, v41, 3
-; CHECK-NEXT:    v_readlane_b32 s34, v41, 2
-; CHECK-NEXT:    v_readlane_b32 s31, v41, 1
-; CHECK-NEXT:    v_readlane_b32 s30, v41, 0
+; CHECK-NEXT:    v_readlane_b32 s31, v41, 14
+; CHECK-NEXT:    v_readlane_b32 s54, v41, 12
+; CHECK-NEXT:    v_readlane_b32 s53, v41, 11
+; CHECK-NEXT:    v_readlane_b32 s52, v41, 10
+; CHECK-NEXT:    v_readlane_b32 s51, v41, 9
+; CHECK-NEXT:    v_readlane_b32 s50, v41, 8
+; CHECK-NEXT:    v_readlane_b32 s49, v41, 7
+; CHECK-NEXT:    v_readlane_b32 s48, v41, 6
+; CHECK-NEXT:    v_readlane_b32 s39, v41, 5
+; CHECK-NEXT:    v_readlane_b32 s38, v41, 4
+; CHECK-NEXT:    v_readlane_b32 s37, v41, 3
+; CHECK-NEXT:    v_readlane_b32 s36, v41, 2
+; CHECK-NEXT:    v_readlane_b32 s35, v41, 1
+; CHECK-NEXT:    v_readlane_b32 s34, v41, 0
 ; CHECK-NEXT:    s_mov_b32 s32, s33
 ; CHECK-NEXT:    v_readlane_b32 s4, v41, 15
 ; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll
index 97a89ec819bae..3cd96055eccca 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll
@@ -890,17 +890,17 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs(<8 x i32> inr
 ; GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[16:17]
 ; GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[18:19]
 ; GISEL-NEXT:    v_writelane_b32 v24, s30, 0
+; GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[20:21]
+; GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[22:23]
 ; GISEL-NEXT:    v_writelane_b32 v24, s31, 1
 ; GISEL-NEXT:    v_readfirstlane_b32 s30, v0
 ; GISEL-NEXT:    v_readfirstlane_b32 s31, v1
-; GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[20:21]
 ; GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[28:29]
-; GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[22:23]
 ; GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[24:25]
 ; GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[26:27]
 ; GISEL-NEXT:    v_mov_b64_e32 v[22:23], s[30:31]
-; GISEL-NEXT:    v_readlane_b32 s31, v24, 1
 ; GISEL-NEXT:    v_readlane_b32 s30, v24, 0
+; GISEL-NEXT:    v_readlane_b32 s31, v24, 1
 ; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v2, v3 op_sel_hi:[0,0,0]
 ; GISEL-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GISEL-NEXT:    scratch_load_dword v24, off, s32 ; 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fadd.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fadd.ll
index 8ece344b61234..df10c953920f3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fadd.ll
@@ -760,7 +760,9 @@ define void @divergent_value_float_dpp(ptr addrspace(1) %out, float %id.x) {
 ; GFX1164DAGISEL-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX1164DAGISEL-NEXT:    s_clause 0x2 ; 12-byte Folded Spill
 ; GFX1164DAGISEL-NEXT:    scratch_store_b32 off, v3, s32
+; GFX1164DAGISEL-NEXT:    ; meta instruction
 ; GFX1164DAGISEL-NEXT:    scratch_store_b32 off, v4, s32 offset:4
+; GFX1164DAGISEL-NEXT:    ; meta instruction
 ; GFX1164DAGISEL-NEXT:    scratch_store_b32 off, v5, s32 offset:8
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
@@ -804,7 +806,9 @@ define void @divergent_value_float_dpp(ptr addrspace(1) %out, float %id.x) {
 ; GFX1164GISEL-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX1164GISEL-NEXT:    s_clause 0x2 ; 12-byte Folded Spill
 ; GFX1164GISEL-NEXT:    scratch_store_b32 off, v3, s32
+; GFX1164GISEL-NEXT:    ; meta instruction
 ; GFX1164GISEL-NEXT:    scratch_store_b32 off, v4, s32 offset:4
+; GFX1164GISEL-NEXT:    ; meta instruction
 ; GFX1164GISEL-NEXT:    scratch_store_b32 off, v5, s32 offset:8
 ; GFX1164GISEL-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164GISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
@@ -848,6 +852,7 @@ define void @divergent_value_float_dpp(ptr addrspace(1) %out, float %id.x) {
 ; GFX1132DAGISEL-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX1132DAGISEL-NEXT:    s_clause 0x1 ; 8-byte Folded Spill
 ; GFX1132DAGISEL-NEXT:    scratch_store_b32 off, v3, s32
+; GFX1132DAGISEL-NEXT:    ; meta instruction
 ; GFX1132DAGISEL-NEXT:    scratch_store_b32 off, v4, s32 offset:4
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s0, -1
@@ -880,6 +885,7 @@ define void @divergent_value_float_dpp(ptr addrspace(1) %out, float %id.x) {
 ; GFX1132GISEL-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX1132GISEL-NEXT:    s_clause 0x1 ; 8-byte Folded Spill
 ; GFX1132GISEL-NEXT:    scratch_store_b32 off, v3, s32
+; GFX1132GISEL-NEXT:    ; meta instruction
 ; GFX1132GISEL-NEXT:    scratch_store_b32 off, v4, s32 offset:4
 ; GFX1132GISEL-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1132GISEL-NEXT:    s_or_saveexec_b32 s0, -1
@@ -916,6 +922,7 @@ define void @divergent_value_float_dpp(ptr addrspace(1) %out, float %id.x) {
 ; GFX12DAGISEL-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX12DAGISEL-NEXT:    s_clause 0x1 ; 8-byte Folded Spill
 ; GFX12DAGISEL-NEXT:    scratch_store_b32 off, v3, s32
+; GFX12DAGISEL-NEXT:    ; meta instruction
 ; GFX12DAGISEL-NEXT:    scratch_store_b32 off, v4, s32 offset:4
 ; GFX12DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fmax.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fmax.ll
index fbe099b46dc21..81e6b34fc036c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fmax.ll
@@ -668,7 +668,9 @@ define void @divergent_value_float_dpp(ptr addrspace(1) %out, float %in) {
 ; GFX1164DAGISEL-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX1164DAGISEL-NEXT:    s_clause 0x2 ; 12-byte Folded Spill
 ; GFX1164DAGISEL-NEXT:    scratch_store_b32 off, v3, s32
+; GFX1164DAGISEL-NEXT:    ; meta instruction
 ; GFX1164DAGISEL-NEXT:    scratch_store_b32 off, v4, s32 offset:4
+; GFX1164DAGISEL-NEXT:    ; meta instruction
 ; GFX1164DAGISEL-NEXT:    scratch_store_b32 off, v5, s32 offset:8
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
@@ -712,7 +714,9 @@ define void @divergent_value_float_dpp(ptr addrspace(1) %out, float %in) {
 ; GFX1164GISEL-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX1164GISEL-NEXT:    s_clause 0x2 ; 12-byte Folded Spill
 ; GFX1164GISEL-NEXT:    scratch_store_b32 off, v3, s32
+; GFX1164GISEL-NEXT:    ; meta instruction
 ; GFX1164GISEL-NEXT:    scratch_store_b32 off, v4, s32 offset:4
+; GFX1164GISEL-NEXT:    ; meta instruction
 ; GFX1164GISEL-NEXT:    scratch_store_b32 off, v5, s32 offset:8
 ; GFX1164GISEL-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164GISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
@@ -756,6 +760,7 @@ define void @divergent_value_float_dpp(ptr addrspace(1) %out, float %in) {
 ; GFX1132DAGISEL-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX1132DAGISEL-NEXT:    s_clause 0x1 ; 8-byte Folded Spill
 ; GFX1132DAGISEL-NEXT:    scratch_store_b32 off, v3, s32
+; GFX1132DAGISEL-NEXT:    ; meta instruction
 ; GFX1132DAGISEL-NEXT:    scratch_store_b32 off, v4, s32 offset:4
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s0, -1
@@ -788,6 +793,7 @@ define void @divergent_value_float_dpp(ptr addrspace(1) %out, float %in) {
 ; GFX1132GISEL-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX1132GISEL-NEXT:    s_clause 0x1 ; 8-byte Folded Spill
 ; GFX1132GISEL-NEXT:    scratch_store_b32 off, v3, s32
+; GFX1132GISEL-NEXT:    ; meta instruction
 ; GFX1132GISEL-NEXT:    scratch_store_b32 off, v4, s32 offset:4
 ; GFX1132GISEL-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1132GISEL-NEXT:    s_or_saveexec_b32 s0, -1
@@ -824,6 +830,7 @@ define void @divergent_value_float_dpp(ptr addrspace(1) %out, float %in) {
 ; GFX12DAGISEL-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX12DAGISEL-NEXT:    s_clause 0x1 ; 8-byte Folded Spill
 ; GFX12DAGISEL-NEXT:    scratch_store_b32 off, v3, s32
+; GFX12DAGISEL-NEXT:    ; meta instruction
 ; GFX12DAGISEL-NEXT:    scratch_store_b32 off, v4, s32 offset:4
 ; GFX12DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fmin.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fmin.ll
index 88542989f5d83..97221f8ae0e8e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fmin.ll
@@ -668,7 +668,9 @@ define void @divergent_value_float_dpp(ptr addrspace(1) %out, float %in) {
 ; GFX1164DAGISEL-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX1164DAGISEL-NEXT:    s_clause 0x2 ; 12-byte Folded Spill
 ; GFX1164DAGISEL-NEXT:    scratch_store_b32 off, v3, s32
+; GFX1164DAGISEL-NEXT:    ; meta instruction
 ; GFX1164DAGISEL-NEXT:    scratch_store_b32 off, v4, s32 offset:4
+; GFX1164DAGISEL-NEXT:    ; meta instruction
 ; GFX1164DAGISEL-NEXT:    scratch_store_b32 off, v5, s32 offset:8
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
@@ -712,7 +714,9 @@ define void @divergent_value_float_dpp(ptr addrspace(1) %out, float %in) {
 ; GFX1164GISEL-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX1164GISEL-NEXT:    s_clause 0x2 ; 12-byte Folded Spill
 ; GFX1164GISEL-NEXT:    scratch_store_b32 off, v3, s32
+; GFX1164GISEL-NEXT:    ; meta instruction
 ; GFX1164GISEL-NEXT:    scratch_store_b32 off, v4, s32 offset:4
+; GFX1164GISEL-NEXT:    ; meta instruction
 ; GFX1164GISEL-NEXT:    scratch_store_b32 off, v5, s32 offset:8
 ; GFX1164GISEL-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164GISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
@@ -756,6 +760,7 @@ define void @divergent_value_float_dpp(ptr addrspace(1) %out, float %in) {
 ; GFX1132DAGISEL-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX1132DAGISEL-NEXT:    s_clause 0x1 ; 8-byte Folded Spill
 ; GFX1132DAGISEL-NEXT:    scratch_store_b32 off, v3, s32
+; GFX1132DAGISEL-NEXT:    ; meta instruction
 ; GFX1132DAGISEL-NEXT:    scratch_store_b32 off, v4, s32 offset:4
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s0, -1
@@ -788,6 +793,7 @@ define void @divergent_value_float_dpp(ptr addrspace(1) %out, float %in) {
 ; GFX1132GISEL-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX1132GISEL-NEXT:    s_clause 0x1 ; 8-byte Folded Spill
 ; GFX1132GISEL-NEXT:    scratch_store_b32 off, v3, s32
+; GFX1132GISEL-NEXT:    ; meta instruction
 ; GFX1132GISEL-NEXT:    scratch_store_b32 off, v4, s32 offset:4
 ; GFX1132GISEL-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1132GISEL-NEXT:    s_or_saveexec_b32 s0, -1
@@ -824,6 +830,7 @@ define void @divergent_value_float_dpp(ptr addrspace(1) %out, float %in) {
 ; GFX12DAGISEL-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX12DAGISEL-NEXT:    s_clause 0x1 ; 8-byte Folded Spill
 ; GFX12DAGISEL-NEXT:    scratch_store_b32 off, v3, s32
+; GFX12DAGISEL-NEXT:    ; meta instruction
 ; GFX12DAGISEL-NEXT:    scratch_store_b32 off, v4, s32 offset:4
 ; GFX12DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fsub.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fsub.ll
index 732d103951b8f..f79b55b5db8e5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fsub.ll
@@ -764,7 +764,9 @@ define void @divergent_value_float_dpp(ptr addrspace(1) %out, float %id.x) {
 ; GFX1164DAGISEL-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX1164DAGISEL-NEXT:    s_clause 0x2 ; 12-byte Folded Spill
 ; GFX1164DAGISEL-NEXT:    scratch_store_b32 off, v3, s32
+; GFX1164DAGISEL-NEXT:    ; meta instruction
 ; GFX1164DAGISEL-NEXT:    scratch_store_b32 off, v4, s32 offset:4
+; GFX1164DAGISEL-NEXT:    ; meta instruction
 ; GFX1164DAGISEL-NEXT:    scratch_store_b32 off, v5, s32 offset:8
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
@@ -809,7 +811,9 @@ define void @divergent_value_float_dpp(ptr addrspace(1) %out, float %id.x) {
 ; GFX1164GISEL-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX1164GISEL-NEXT:    s_clause 0x2 ; 12-byte Folded Spill
 ; GFX1164GISEL-NEXT:    scratch_store_b32 off, v3, s32
+; GFX1164GISEL-NEXT:    ; meta instruction
 ; GFX1164GISEL-NEXT:    scratch_store_b32 off, v4, s32 offset:4
+; GFX1164GISEL-NEXT:    ; meta instruction
 ; GFX1164GISEL-NEXT:    scratch_store_b32 off, v5, s32 offset:8
 ; GFX1164GISEL-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164GISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
@@ -854,6 +858,7 @@ define void @divergent_value_float_dpp(ptr addrspace(1) %out, float %id.x) {
 ; GFX1132DAGISEL-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX1132DAGISEL-NEXT:    s_clause 0x1 ; 8-byte Folded Spill
 ; GFX1132DAGISEL-NEXT:    scratch_store_b32 off, v3, s32
+; GFX1132DAGISEL-NEXT:    ; meta instruction
 ; GFX1132DAGISEL-NEXT:    scratch_store_b32 off, v4, s32 offset:4
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s0, -1
@@ -888,6 +893,7 @@ define void @divergent_value_float_dpp(ptr addrspace(1) %out, float %id.x) {
 ; GFX1132GISEL-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX1132GISEL-NEXT:    s_clause 0x1 ; 8-byte Folded Spill
 ; GFX1132GISEL-NEXT:    scratch_store_b32 off, v3, s32
+; GFX1132GISEL-NEXT:    ; meta instruction
 ; GFX1132GISEL-NEXT:    scratch_store_b32 off, v4, s32 offset:4
 ; GFX1132GISEL-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1132GISEL-NEXT:    s_or_saveexec_b32 s0, -1
@@ -926,6 +932,7 @@ define void @divergent_value_float_dpp(ptr addrspace(1) %out, float %id.x) {
 ; GFX12DAGISEL-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX12DAGISEL-NEXT:    s_clause 0x1 ; 8-byte Folded Spill
 ; GFX12DAGISEL-NEXT:    scratch_store_b32 off, v3, s32
+; GFX12DAGISEL-NEXT:    ; meta instruction
 ; GFX12DAGISEL-NEXT:    scratch_store_b32 off, v4, s32 offset:4
 ; GFX12DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
diff --git a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll
index 481eb1bc3d91a..68c0d78485517 100644
--- a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll
@@ -37,26 +37,26 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
 ; GFX7-NEXT:    s_add_i32 s6, s32, 0x101100
 ; GFX7-NEXT:    buffer_store_dword v23, off, s[0:3], s6 ; 4-byte Folded Spill
 ; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX7-NEXT:    v_writelane_b32 v23, s30, 0
-; GFX7-NEXT:    v_writelane_b32 v23, s31, 1
-; GFX7-NEXT:    v_writelane_b32 v23, s33, 2
-; GFX7-NEXT:    v_writelane_b32 v23, s34, 3
-; GFX7-NEXT:    v_writelane_b32 v23, s35, 4
-; GFX7-NEXT:    v_writelane_b32 v23, s36, 5
-; GFX7-NEXT:    v_writelane_b32 v23, s37, 6
-; GFX7-NEXT:    v_writelane_b32 v23, s38, 7
-; GFX7-NEXT:    v_writelane_b32 v23, s39, 8
-; GFX7-NEXT:    v_writelane_b32 v23, s48, 9
-; GFX7-NEXT:    v_writelane_b32 v23, s49, 10
-; GFX7-NEXT:    v_writelane_b32 v23, s50, 11
-; GFX7-NEXT:    v_writelane_b32 v23, s51, 12
-; GFX7-NEXT:    v_writelane_b32 v23, s52, 13
-; GFX7-NEXT:    v_writelane_b32 v23, s53, 14
+; GFX7-NEXT:    v_writelane_b32 v23, s33, 0
+; GFX7-NEXT:    v_writelane_b32 v23, s34, 1
+; GFX7-NEXT:    v_writelane_b32 v23, s35, 2
+; GFX7-NEXT:    v_writelane_b32 v23, s36, 3
+; GFX7-NEXT:    v_writelane_b32 v23, s37, 4
+; GFX7-NEXT:    v_writelane_b32 v23, s38, 5
+; GFX7-NEXT:    v_writelane_b32 v23, s39, 6
+; GFX7-NEXT:    v_writelane_b32 v23, s48, 7
+; GFX7-NEXT:    v_writelane_b32 v23, s49, 8
+; GFX7-NEXT:    v_writelane_b32 v23, s50, 9
+; GFX7-NEXT:    v_writelane_b32 v23, s51, 10
+; GFX7-NEXT:    v_writelane_b32 v23, s52, 11
+; GFX7-NEXT:    v_writelane_b32 v23, s53, 12
+; GFX7-NEXT:    v_writelane_b32 v23, s54, 13
+; GFX7-NEXT:    v_writelane_b32 v23, s55, 14
 ; GFX7-NEXT:    v_lshr_b32_e64 v0, s32, 6
-; GFX7-NEXT:    v_writelane_b32 v23, s54, 15
+; GFX7-NEXT:    v_writelane_b32 v23, s30, 15
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 64, v0
 ; GFX7-NEXT:    s_and_b64 s[4:5], 0, exec
-; GFX7-NEXT:    v_writelane_b32 v23, s55, 16
+; GFX7-NEXT:    v_writelane_b32 v23, s31, 16
 ; GFX7-NEXT:    ;;#ASMSTART
 ; GFX7-NEXT:    ; use alloca0 v0
 ; GFX7-NEXT:    ;;#ASMEND
@@ -73,23 +73,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
 ; GFX7-NEXT:    ;;#ASMSTART
 ; GFX7-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s54, scc
 ; GFX7-NEXT:    ;;#ASMEND
-; GFX7-NEXT:    v_readlane_b32 s55, v23, 16
-; GFX7-NEXT:    v_readlane_b32 s54, v23, 15
-; GFX7-NEXT:    v_readlane_b32 s53, v23, 14
-; GFX7-NEXT:    v_readlane_b32 s52, v23, 13
-; GFX7-NEXT:    v_readlane_b32 s51, v23, 12
-; GFX7-NEXT:    v_readlane_b32 s50, v23, 11
-; GFX7-NEXT:    v_readlane_b32 s49, v23, 10
-; GFX7-NEXT:    v_readlane_b32 s48, v23, 9
-; GFX7-NEXT:    v_readlane_b32 s39, v23, 8
-; GFX7-NEXT:    v_readlane_b32 s38, v23, 7
-; GFX7-NEXT:    v_readlane_b32 s37, v23, 6
-; GFX7-NEXT:    v_readlane_b32 s36, v23, 5
-; GFX7-NEXT:    v_readlane_b32 s35, v23, 4
-; GFX7-NEXT:    v_readlane_b32 s34, v23, 3
-; GFX7-NEXT:    v_readlane_b32 s33, v23, 2
-; GFX7-NEXT:    v_readlane_b32 s31, v23, 1
-; GFX7-NEXT:    v_readlane_b32 s30, v23, 0
+; GFX7-NEXT:    v_readlane_b32 s30, v23, 15
+; GFX7-NEXT:    v_readlane_b32 s31, v23, 16
+; GFX7-NEXT:    v_readlane_b32 s55, v23, 14
+; GFX7-NEXT:    v_readlane_b32 s54, v23, 13
+; GFX7-NEXT:    v_readlane_b32 s53, v23, 12
+; GFX7-NEXT:    v_readlane_b32 s52, v23, 11
+; GFX7-NEXT:    v_readlane_b32 s51, v23, 10
+; GFX7-NEXT:    v_readlane_b32 s50, v23, 9
+; GFX7-NEXT:    v_readlane_b32 s49, v23, 8
+; GFX7-NEXT:    v_readlane_b32 s48, v23, 7
+; GFX7-NEXT:    v_readlane_b32 s39, v23, 6
+; GFX7-NEXT:    v_readlane_b32 s38, v23, 5
+; GFX7-NEXT:    v_readlane_b32 s37, v23, 4
+; GFX7-NEXT:    v_readlane_b32 s36, v23, 3
+; GFX7-NEXT:    v_readlane_b32 s35, v23, 2
+; GFX7-NEXT:    v_readlane_b32 s34, v23, 1
+; GFX7-NEXT:    v_readlane_b32 s33, v23, 0
 ; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX7-NEXT:    s_add_i32 s6, s32, 0x101100
 ; GFX7-NEXT:    buffer_load_dword v23, off, s[0:3], s6 ; 4-byte Folded Reload
@@ -104,26 +104,26 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
 ; GFX8-NEXT:    s_add_i32 s6, s32, 0x101100
 ; GFX8-NEXT:    buffer_store_dword v23, off, s[0:3], s6 ; 4-byte Folded Spill
 ; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX8-NEXT:    v_writelane_b32 v23, s30, 0
-; GFX8-NEXT:    v_writelane_b32 v23, s31, 1
-; GFX8-NEXT:    v_writelane_b32 v23, s33, 2
-; GFX8-NEXT:    v_writelane_b32 v23, s34, 3
-; GFX8-NEXT:    v_writelane_b32 v23, s35, 4
-; GFX8-NEXT:    v_writelane_b32 v23, s36, 5
-; GFX8-NEXT:    v_writelane_b32 v23, s37, 6
-; GFX8-NEXT:    v_writelane_b32 v23, s38, 7
-; GFX8-NEXT:    v_writelane_b32 v23, s39, 8
-; GFX8-NEXT:    v_writelane_b32 v23, s48, 9
-; GFX8-NEXT:    v_writelane_b32 v23, s49, 10
-; GFX8-NEXT:    v_writelane_b32 v23, s50, 11
-; GFX8-NEXT:    v_writelane_b32 v23, s51, 12
-; GFX8-NEXT:    v_writelane_b32 v23, s52, 13
-; GFX8-NEXT:    v_writelane_b32 v23, s53, 14
+; GFX8-NEXT:    v_writelane_b32 v23, s33, 0
+; GFX8-NEXT:    v_writelane_b32 v23, s34, 1
+; GFX8-NEXT:    v_writelane_b32 v23, s35, 2
+; GFX8-NEXT:    v_writelane_b32 v23, s36, 3
+; GFX8-NEXT:    v_writelane_b32 v23, s37, 4
+; GFX8-NEXT:    v_writelane_b32 v23, s38, 5
+; GFX8-NEXT:    v_writelane_b32 v23, s39, 6
+; GFX8-NEXT:    v_writelane_b32 v23, s48, 7
+; GFX8-NEXT:    v_writelane_b32 v23, s49, 8
+; GFX8-NEXT:    v_writelane_b32 v23, s50, 9
+; GFX8-NEXT:    v_writelane_b32 v23, s51, 10
+; GFX8-NEXT:    v_writelane_b32 v23, s52, 11
+; GFX8-NEXT:    v_writelane_b32 v23, s53, 12
+; GFX8-NEXT:    v_writelane_b32 v23, s54, 13
+; GFX8-NEXT:    v_writelane_b32 v23, s55, 14
 ; GFX8-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
-; GFX8-NEXT:    v_writelane_b32 v23, s54, 15
+; GFX8-NEXT:    v_writelane_b32 v23, s30, 15
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 64, v0
 ; GFX8-NEXT:    s_and_b64 s[4:5], 0, exec
-; GFX8-NEXT:    v_writelane_b32 v23, s55, 16
+; GFX8-NEXT:    v_writelane_b32 v23, s31, 16
 ; GFX8-NEXT:    ;;#ASMSTART
 ; GFX8-NEXT:    ; use alloca0 v0
 ; GFX8-NEXT:    ;;#ASMEND
@@ -141,23 +141,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
 ; GFX8-NEXT:    ;;#ASMSTART
 ; GFX8-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s54, scc
 ; GFX8-NEXT:    ;;#ASMEND
-; GFX8-NEXT:    v_readlane_b32 s55, v23, 16
-; GFX8-NEXT:    v_readlane_b32 s54, v23, 15
-; GFX8-NEXT:    v_readlane_b32 s53, v23, 14
-; GFX8-NEXT:    v_readlane_b32 s52, v23, 13
-; GFX8-NEXT:    v_readlane_b32 s51, v23, 12
-; GFX8-NEXT:    v_readlane_b32 s50, v23, 11
-; GFX8-NEXT:    v_readlane_b32 s49, v23, 10
-; GFX8-NEXT:    v_readlane_b32 s48, v23, 9
-; GFX8-NEXT:    v_readlane_b32 s39, v23, 8
-; GFX8-NEXT:    v_readlane_b32 s38, v23, 7
-; GFX8-NEXT:    v_readlane_b32 s37, v23, 6
-; GFX8-NEXT:    v_readlane_b32 s36, v23, 5
-; GFX8-NEXT:    v_readlane_b32 s35, v23, 4
-; GFX8-NEXT:    v_readlane_b32 s34, v23, 3
-; GFX8-NEXT:    v_readlane_b32 s33, v23, 2
-; GFX8-NEXT:    v_readlane_b32 s31, v23, 1
-; GFX8-NEXT:    v_readlane_b32 s30, v23, 0
+; GFX8-NEXT:    v_readlane_b32 s30, v23, 15
+; GFX8-NEXT:    v_readlane_b32 s31, v23, 16
+; GFX8-NEXT:    v_readlane_b32 s55, v23, 14
+; GFX8-NEXT:    v_readlane_b32 s54, v23, 13
+; GFX8-NEXT:    v_readlane_b32 s53, v23, 12
+; GFX8-NEXT:    v_readlane_b32 s52, v23, 11
+; GFX8-NEXT:    v_readlane_b32 s51, v23, 10
+; GFX8-NEXT:    v_readlane_b32 s50, v23, 9
+; GFX8-NEXT:    v_readlane_b32 s49, v23, 8
+; GFX8-NEXT:    v_readlane_b32 s48, v23, 7
+; GFX8-NEXT:    v_readlane_b32 s39, v23, 6
+; GFX8-NEXT:    v_readlane_b32 s38, v23, 5
+; GFX8-NEXT:    v_readlane_b32 s37, v23, 4
+; GFX8-NEXT:    v_readlane_b32 s36, v23, 3
+; GFX8-NEXT:    v_readlane_b32 s35, v23, 2
+; GFX8-NEXT:    v_readlane_b32 s34, v23, 1
+; GFX8-NEXT:    v_readlane_b32 s33, v23, 0
 ; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX8-NEXT:    s_add_i32 s6, s32, 0x101100
 ; GFX8-NEXT:    buffer_load_dword v23, off, s[0:3], s6 ; 4-byte Folded Reload
@@ -172,26 +172,26 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
 ; GFX900-NEXT:    s_add_i32 s6, s32, 0x101100
 ; GFX900-NEXT:    buffer_store_dword v23, off, s[0:3], s6 ; 4-byte Folded Spill
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX900-NEXT:    v_writelane_b32 v23, s30, 0
-; GFX900-NEXT:    v_writelane_b32 v23, s31, 1
-; GFX900-NEXT:    v_writelane_b32 v23, s33, 2
-; GFX900-NEXT:    v_writelane_b32 v23, s34, 3
-; GFX900-NEXT:    v_writelane_b32 v23, s35, 4
-; GFX900-NEXT:    v_writelane_b32 v23, s36, 5
-; GFX900-NEXT:    v_writelane_b32 v23, s37, 6
-; GFX900-NEXT:    v_writelane_b32 v23, s38, 7
-; GFX900-NEXT:    v_writelane_b32 v23, s39, 8
-; GFX900-NEXT:    v_writelane_b32 v23, s48, 9
-; GFX900-NEXT:    v_writelane_b32 v23, s49, 10
-; GFX900-NEXT:    v_writelane_b32 v23, s50, 11
-; GFX900-NEXT:    v_writelane_b32 v23, s51, 12
-; GFX900-NEXT:    v_writelane_b32 v23, s52, 13
-; GFX900-NEXT:    v_writelane_b32 v23, s53, 14
+; GFX900-NEXT:    v_writelane_b32 v23, s33, 0
+; GFX900-NEXT:    v_writelane_b32 v23, s34, 1
+; GFX900-NEXT:    v_writelane_b32 v23, s35, 2
+; GFX900-NEXT:    v_writelane_b32 v23, s36, 3
+; GFX900-NEXT:    v_writelane_b32 v23, s37, 4
+; GFX900-NEXT:    v_writelane_b32 v23, s38, 5
+; GFX900-NEXT:    v_writelane_b32 v23, s39, 6
+; GFX900-NEXT:    v_writelane_b32 v23, s48, 7
+; GFX900-NEXT:    v_writelane_b32 v23, s49, 8
+; GFX900-NEXT:    v_writelane_b32 v23, s50, 9
+; GFX900-NEXT:    v_writelane_b32 v23, s51, 10
+; GFX900-NEXT:    v_writelane_b32 v23, s52, 11
+; GFX900-NEXT:    v_writelane_b32 v23, s53, 12
+; GFX900-NEXT:    v_writelane_b32 v23, s54, 13
+; GFX900-NEXT:    v_writelane_b32 v23, s55, 14
 ; GFX900-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
-; GFX900-NEXT:    v_writelane_b32 v23, s54, 15
+; GFX900-NEXT:    v_writelane_b32 v23, s30, 15
 ; GFX900-NEXT:    v_add_u32_e32 v0, 64, v0
 ; GFX900-NEXT:    s_and_b64 s[4:5], 0, exec
-; GFX900-NEXT:    v_writelane_b32 v23, s55, 16
+; GFX900-NEXT:    v_writelane_b32 v23, s31, 16
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use alloca0 v0
 ; GFX900-NEXT:    ;;#ASMEND
@@ -208,23 +208,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s54, scc
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    v_readlane_b32 s55, v23, 16
-; GFX900-NEXT:    v_readlane_b32 s54, v23, 15
-; GFX900-NEXT:    v_readlane_b32 s53, v23, 14
-; GFX900-NEXT:    v_readlane_b32 s52, v23, 13
-; GFX900-NEXT:    v_readlane_b32 s51, v23, 12
-; GFX900-NEXT:    v_readlane_b32 s50, v23, 11
-; GFX900-NEXT:    v_readlane_b32 s49, v23, 10
-; GFX900-NEXT:    v_readlane_b32 s48, v23, 9
-; GFX900-NEXT:    v_readlane_b32 s39, v23, 8
-; GFX900-NEXT:    v_readlane_b32 s38, v23, 7
-; GFX900-NEXT:    v_readlane_b32 s37, v23, 6
-; GFX900-NEXT:    v_readlane_b32 s36, v23, 5
-; GFX900-NEXT:    v_readlane_b32 s35, v23, 4
-; GFX900-NEXT:    v_readlane_b32 s34, v23, 3
-; GFX900-NEXT:    v_readlane_b32 s33, v23, 2
-; GFX900-NEXT:    v_readlane_b32 s31, v23, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v23, 0
+; GFX900-NEXT:    v_readlane_b32 s30, v23, 15
+; GFX900-NEXT:    v_readlane_b32 s31, v23, 16
+; GFX900-NEXT:    v_readlane_b32 s55, v23, 14
+; GFX900-NEXT:    v_readlane_b32 s54, v23, 13
+; GFX900-NEXT:    v_readlane_b32 s53, v23, 12
+; GFX900-NEXT:    v_readlane_b32 s52, v23, 11
+; GFX900-NEXT:    v_readlane_b32 s51, v23, 10
+; GFX900-NEXT:    v_readlane_b32 s50, v23, 9
+; GFX900-NEXT:    v_readlane_b32 s49, v23, 8
+; GFX900-NEXT:    v_readlane_b32 s48, v23, 7
+; GFX900-NEXT:    v_readlane_b32 s39, v23, 6
+; GFX900-NEXT:    v_readlane_b32 s38, v23, 5
+; GFX900-NEXT:    v_readlane_b32 s37, v23, 4
+; GFX900-NEXT:    v_readlane_b32 s36, v23, 3
+; GFX900-NEXT:    v_readlane_b32 s35, v23, 2
+; GFX900-NEXT:    v_readlane_b32 s34, v23, 1
+; GFX900-NEXT:    v_readlane_b32 s33, v23, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    s_add_i32 s6, s32, 0x101100
 ; GFX900-NEXT:    buffer_load_dword v23, off, s[0:3], s6 ; 4-byte Folded Reload
@@ -239,26 +239,26 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
 ; GFX942-NEXT:    s_add_i32 s2, s32, 0x4044
 ; GFX942-NEXT:    scratch_store_dword off, v23, s2 ; 4-byte Folded Spill
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX942-NEXT:    v_writelane_b32 v23, s30, 0
-; GFX942-NEXT:    v_writelane_b32 v23, s31, 1
-; GFX942-NEXT:    v_writelane_b32 v23, s33, 2
-; GFX942-NEXT:    v_writelane_b32 v23, s34, 3
-; GFX942-NEXT:    v_writelane_b32 v23, s35, 4
-; GFX942-NEXT:    v_writelane_b32 v23, s36, 5
-; GFX942-NEXT:    v_writelane_b32 v23, s37, 6
-; GFX942-NEXT:    v_writelane_b32 v23, s38, 7
-; GFX942-NEXT:    v_writelane_b32 v23, s39, 8
-; GFX942-NEXT:    v_writelane_b32 v23, s48, 9
-; GFX942-NEXT:    v_writelane_b32 v23, s49, 10
-; GFX942-NEXT:    v_writelane_b32 v23, s50, 11
-; GFX942-NEXT:    v_writelane_b32 v23, s51, 12
-; GFX942-NEXT:    v_writelane_b32 v23, s52, 13
-; GFX942-NEXT:    v_writelane_b32 v23, s53, 14
+; GFX942-NEXT:    v_writelane_b32 v23, s33, 0
+; GFX942-NEXT:    v_writelane_b32 v23, s34, 1
+; GFX942-NEXT:    v_writelane_b32 v23, s35, 2
+; GFX942-NEXT:    v_writelane_b32 v23, s36, 3
+; GFX942-NEXT:    v_writelane_b32 v23, s37, 4
+; GFX942-NEXT:    v_writelane_b32 v23, s38, 5
+; GFX942-NEXT:    v_writelane_b32 v23, s39, 6
+; GFX942-NEXT:    v_writelane_b32 v23, s48, 7
+; GFX942-NEXT:    v_writelane_b32 v23, s49, 8
+; GFX942-NEXT:    v_writelane_b32 v23, s50, 9
+; GFX942-NEXT:    v_writelane_b32 v23, s51, 10
+; GFX942-NEXT:    v_writelane_b32 v23, s52, 11
+; GFX942-NEXT:    v_writelane_b32 v23, s53, 12
+; GFX942-NEXT:    v_writelane_b32 v23, s54, 13
+; GFX942-NEXT:    v_writelane_b32 v23, s55, 14
 ; GFX942-NEXT:    s_add_i32 s0, s32, 64
-; GFX942-NEXT:    v_writelane_b32 v23, s54, 15
+; GFX942-NEXT:    v_writelane_b32 v23, s30, 15
 ; GFX942-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX942-NEXT:    s_and_b64 s[60:61], 0, exec
-; GFX942-NEXT:    v_writelane_b32 v23, s55, 16
+; GFX942-NEXT:    v_writelane_b32 v23, s31, 16
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use alloca0 v0
 ; GFX942-NEXT:    ;;#ASMEND
@@ -273,23 +273,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s54, scc
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_readlane_b32 s55, v23, 16
-; GFX942-NEXT:    v_readlane_b32 s54, v23, 15
-; GFX942-NEXT:    v_readlane_b32 s53, v23, 14
-; GFX942-NEXT:    v_readlane_b32 s52, v23, 13
-; GFX942-NEXT:    v_readlane_b32 s51, v23, 12
-; GFX942-NEXT:    v_readlane_b32 s50, v23, 11
-; GFX942-NEXT:    v_readlane_b32 s49, v23, 10
-; GFX942-NEXT:    v_readlane_b32 s48, v23, 9
-; GFX942-NEXT:    v_readlane_b32 s39, v23, 8
-; GFX942-NEXT:    v_readlane_b32 s38, v23, 7
-; GFX942-NEXT:    v_readlane_b32 s37, v23, 6
-; GFX942-NEXT:    v_readlane_b32 s36, v23, 5
-; GFX942-NEXT:    v_readlane_b32 s35, v23, 4
-; GFX942-NEXT:    v_readlane_b32 s34, v23, 3
-; GFX942-NEXT:    v_readlane_b32 s33, v23, 2
-; GFX942-NEXT:    v_readlane_b32 s31, v23, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v23, 0
+; GFX942-NEXT:    v_readlane_b32 s30, v23, 15
+; GFX942-NEXT:    v_readlane_b32 s31, v23, 16
+; GFX942-NEXT:    v_readlane_b32 s55, v23, 14
+; GFX942-NEXT:    v_readlane_b32 s54, v23, 13
+; GFX942-NEXT:    v_readlane_b32 s53, v23, 12
+; GFX942-NEXT:    v_readlane_b32 s52, v23, 11
+; GFX942-NEXT:    v_readlane_b32 s51, v23, 10
+; GFX942-NEXT:    v_readlane_b32 s50, v23, 9
+; GFX942-NEXT:    v_readlane_b32 s49, v23, 8
+; GFX942-NEXT:    v_readlane_b32 s48, v23, 7
+; GFX942-NEXT:    v_readlane_b32 s39, v23, 6
+; GFX942-NEXT:    v_readlane_b32 s38, v23, 5
+; GFX942-NEXT:    v_readlane_b32 s37, v23, 4
+; GFX942-NEXT:    v_readlane_b32 s36, v23, 3
+; GFX942-NEXT:    v_readlane_b32 s35, v23, 2
+; GFX942-NEXT:    v_readlane_b32 s34, v23, 1
+; GFX942-NEXT:    v_readlane_b32 s33, v23, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    s_add_i32 s2, s32, 0x4044
 ; GFX942-NEXT:    scratch_load_dword v23, off, s2 ; 4-byte Folded Reload
@@ -305,29 +305,29 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
 ; GFX10_1-NEXT:    buffer_store_dword v23, off, s[0:3], s5 ; 4-byte Folded Spill
 ; GFX10_1-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
 ; GFX10_1-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10_1-NEXT:    v_writelane_b32 v23, s30, 0
+; GFX10_1-NEXT:    v_writelane_b32 v23, s33, 0
 ; GFX10_1-NEXT:    v_lshrrev_b32_e64 v0, 5, s32
 ; GFX10_1-NEXT:    s_and_b32 s4, 0, exec_lo
-; GFX10_1-NEXT:    v_writelane_b32 v23, s31, 1
+; GFX10_1-NEXT:    v_writelane_b32 v23, s34, 1
 ; GFX10_1-NEXT:    v_add_nc_u32_e32 v0, 64, v0
 ; GFX10_1-NEXT:    ;;#ASMSTART
 ; GFX10_1-NEXT:    ; use alloca0 v0
 ; GFX10_1-NEXT:    ;;#ASMEND
-; GFX10_1-NEXT:    v_writelane_b32 v23, s33, 2
-; GFX10_1-NEXT:    v_writelane_b32 v23, s34, 3
-; GFX10_1-NEXT:    v_writelane_b32 v23, s35, 4
-; GFX10_1-NEXT:    v_writelane_b32 v23, s36, 5
-; GFX10_1-NEXT:    v_writelane_b32 v23, s37, 6
-; GFX10_1-NEXT:    v_writelane_b32 v23, s38, 7
-; GFX10_1-NEXT:    v_writelane_b32 v23, s39, 8
-; GFX10_1-NEXT:    v_writelane_b32 v23, s48, 9
-; GFX10_1-NEXT:    v_writelane_b32 v23, s49, 10
-; GFX10_1-NEXT:    v_writelane_b32 v23, s50, 11
-; GFX10_1-NEXT:    v_writelane_b32 v23, s51, 12
-; GFX10_1-NEXT:    v_writelane_b32 v23, s52, 13
-; GFX10_1-NEXT:    v_writelane_b32 v23, s53, 14
-; GFX10_1-NEXT:    v_writelane_b32 v23, s54, 15
-; GFX10_1-NEXT:    v_writelane_b32 v23, s55, 16
+; GFX10_1-NEXT:    v_writelane_b32 v23, s35, 2
+; GFX10_1-NEXT:    v_writelane_b32 v23, s36, 3
+; GFX10_1-NEXT:    v_writelane_b32 v23, s37, 4
+; GFX10_1-NEXT:    v_writelane_b32 v23, s38, 5
+; GFX10_1-NEXT:    v_writelane_b32 v23, s39, 6
+; GFX10_1-NEXT:    v_writelane_b32 v23, s48, 7
+; GFX10_1-NEXT:    v_writelane_b32 v23, s49, 8
+; GFX10_1-NEXT:    v_writelane_b32 v23, s50, 9
+; GFX10_1-NEXT:    v_writelane_b32 v23, s51, 10
+; GFX10_1-NEXT:    v_writelane_b32 v23, s52, 11
+; GFX10_1-NEXT:    v_writelane_b32 v23, s53, 12
+; GFX10_1-NEXT:    v_writelane_b32 v23, s54, 13
+; GFX10_1-NEXT:    v_writelane_b32 v23, s55, 14
+; GFX10_1-NEXT:    v_writelane_b32 v23, s30, 15
+; GFX10_1-NEXT:    v_writelane_b32 v23, s31, 16
 ; GFX10_1-NEXT:    ;;#ASMSTART
 ; GFX10_1-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc
 ; GFX10_1-NEXT:    ;;#ASMEND
@@ -338,23 +338,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
 ; GFX10_1-NEXT:    ;;#ASMSTART
 ; GFX10_1-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s54, scc
 ; GFX10_1-NEXT:    ;;#ASMEND
-; GFX10_1-NEXT:    v_readlane_b32 s55, v23, 16
-; GFX10_1-NEXT:    v_readlane_b32 s54, v23, 15
-; GFX10_1-NEXT:    v_readlane_b32 s53, v23, 14
-; GFX10_1-NEXT:    v_readlane_b32 s52, v23, 13
-; GFX10_1-NEXT:    v_readlane_b32 s51, v23, 12
-; GFX10_1-NEXT:    v_readlane_b32 s50, v23, 11
-; GFX10_1-NEXT:    v_readlane_b32 s49, v23, 10
-; GFX10_1-NEXT:    v_readlane_b32 s48, v23, 9
-; GFX10_1-NEXT:    v_readlane_b32 s39, v23, 8
-; GFX10_1-NEXT:    v_readlane_b32 s38, v23, 7
-; GFX10_1-NEXT:    v_readlane_b32 s37, v23, 6
-; GFX10_1-NEXT:    v_readlane_b32 s36, v23, 5
-; GFX10_1-NEXT:    v_readlane_b32 s35, v23, 4
-; GFX10_1-NEXT:    v_readlane_b32 s34, v23, 3
-; GFX10_1-NEXT:    v_readlane_b32 s33, v23, 2
-; GFX10_1-NEXT:    v_readlane_b32 s31, v23, 1
-; GFX10_1-NEXT:    v_readlane_b32 s30, v23, 0
+; GFX10_1-NEXT:    v_readlane_b32 s30, v23, 15
+; GFX10_1-NEXT:    v_readlane_b32 s31, v23, 16
+; GFX10_1-NEXT:    v_readlane_b32 s55, v23, 14
+; GFX10_1-NEXT:    v_readlane_b32 s54, v23, 13
+; GFX10_1-NEXT:    v_readlane_b32 s53, v23, 12
+; GFX10_1-NEXT:    v_readlane_b32 s52, v23, 11
+; GFX10_1-NEXT:    v_readlane_b32 s51, v23, 10
+; GFX10_1-NEXT:    v_readlane_b32 s50, v23, 9
+; GFX10_1-NEXT:    v_readlane_b32 s49, v23, 8
+; GFX10_1-NEXT:    v_readlane_b32 s48, v23, 7
+; GFX10_1-NEXT:    v_readlane_b32 s39, v23, 6
+; GFX10_1-NEXT:    v_readlane_b32 s38, v23, 5
+; GFX10_1-NEXT:    v_readlane_b32 s37, v23, 4
+; GFX10_1-NEXT:    v_readlane_b32 s36, v23, 3
+; GFX10_1-NEXT:    v_readlane_b32 s35, v23, 2
+; GFX10_1-NEXT:    v_readlane_b32 s34, v23, 1
+; GFX10_1-NEXT:    v_readlane_b32 s33, v23, 0
 ; GFX10_1-NEXT:    s_xor_saveexec_b32 s4, -1
 ; GFX10_1-NEXT:    s_add_i32 s5, s32, 0x80880
 ; GFX10_1-NEXT:    buffer_load_dword v23, off, s[0:3], s5 ; 4-byte Folded Reload
@@ -370,29 +370,29 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
 ; GFX10_3-NEXT:    s_add_i32 s5, s32, 0x80880
 ; GFX10_3-NEXT:    buffer_store_dword v23, off, s[0:3], s5 ; 4-byte Folded Spill
 ; GFX10_3-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10_3-NEXT:    v_writelane_b32 v23, s30, 0
+; GFX10_3-NEXT:    v_writelane_b32 v23, s33, 0
 ; GFX10_3-NEXT:    v_lshrrev_b32_e64 v0, 5, s32
 ; GFX10_3-NEXT:    s_and_b32 s4, 0, exec_lo
-; GFX10_3-NEXT:    v_writelane_b32 v23, s31, 1
+; GFX10_3-NEXT:    v_writelane_b32 v23, s34, 1
 ; GFX10_3-NEXT:    v_add_nc_u32_e32 v0, 64, v0
 ; GFX10_3-NEXT:    ;;#ASMSTART
 ; GFX10_3-NEXT:    ; use alloca0 v0
 ; GFX10_3-NEXT:    ;;#ASMEND
-; GFX10_3-NEXT:    v_writelane_b32 v23, s33, 2
-; GFX10_3-NEXT:    v_writelane_b32 v23, s34, 3
-; GFX10_3-NEXT:    v_writelane_b32 v23, s35, 4
-; GFX10_3-NEXT:    v_writelane_b32 v23, s36, 5
-; GFX10_3-NEXT:    v_writelane_b32 v23, s37, 6
-; GFX10_3-NEXT:    v_writelane_b32 v23, s38, 7
-; GFX10_3-NEXT:    v_writelane_b32 v23, s39, 8
-; GFX10_3-NEXT:    v_writelane_b32 v23, s48, 9
-; GFX10_3-NEXT:    v_writelane_b32 v23, s49, 10
-; GFX10_3-NEXT:    v_writelane_b32 v23, s50, 11
-; GFX10_3-NEXT:    v_writelane_b32 v23, s51, 12
-; GFX10_3-NEXT:    v_writelane_b32 v23, s52, 13
-; GFX10_3-NEXT:    v_writelane_b32 v23, s53, 14
-; GFX10_3-NEXT:    v_writelane_b32 v23, s54, 15
-; GFX10_3-NEXT:    v_writelane_b32 v23, s55, 16
+; GFX10_3-NEXT:    v_writelane_b32 v23, s35, 2
+; GFX10_3-NEXT:    v_writelane_b32 v23, s36, 3
+; GFX10_3-NEXT:    v_writelane_b32 v23, s37, 4
+; GFX10_3-NEXT:    v_writelane_b32 v23, s38, 5
+; GFX10_3-NEXT:    v_writelane_b32 v23, s39, 6
+; GFX10_3-NEXT:    v_writelane_b32 v23, s48, 7
+; GFX10_3-NEXT:    v_writelane_b32 v23, s49, 8
+; GFX10_3-NEXT:    v_writelane_b32 v23, s50, 9
+; GFX10_3-NEXT:    v_writelane_b32 v23, s51, 10
+; GFX10_3-NEXT:    v_writelane_b32 v23, s52, 11
+; GFX10_3-NEXT:    v_writelane_b32 v23, s53, 12
+; GFX10_3-NEXT:    v_writelane_b32 v23, s54, 13
+; GFX10_3-NEXT:    v_writelane_b32 v23, s55, 14
+; GFX10_3-NEXT:    v_writelane_b32 v23, s30, 15
+; GFX10_3-NEXT:    v_writelane_b32 v23, s31, 16
 ; GFX10_3-NEXT:    ;;#ASMSTART
 ; GFX10_3-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc
 ; GFX10_3-NEXT:    ;;#ASMEND
@@ -403,23 +403,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
 ; GFX10_3-NEXT:    ;;#ASMSTART
 ; GFX10_3-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s54, scc
 ; GFX10_3-NEXT:    ;;#ASMEND
-; GFX10_3-NEXT:    v_readlane_b32 s55, v23, 16
-; GFX10_3-NEXT:    v_readlane_b32 s54, v23, 15
-; GFX10_3-NEXT:    v_readlane_b32 s53, v23, 14
-; GFX10_3-NEXT:    v_readlane_b32 s52, v23, 13
-; GFX10_3-NEXT:    v_readlane_b32 s51, v23, 12
-; GFX10_3-NEXT:    v_readlane_b32 s50, v23, 11
-; GFX10_3-NEXT:    v_readlane_b32 s49, v23, 10
-; GFX10_3-NEXT:    v_readlane_b32 s48, v23, 9
-; GFX10_3-NEXT:    v_readlane_b32 s39, v23, 8
-; GFX10_3-NEXT:    v_readlane_b32 s38, v23, 7
-; GFX10_3-NEXT:    v_readlane_b32 s37, v23, 6
-; GFX10_3-NEXT:    v_readlane_b32 s36, v23, 5
-; GFX10_3-NEXT:    v_readlane_b32 s35, v23, 4
-; GFX10_3-NEXT:    v_readlane_b32 s34, v23, 3
-; GFX10_3-NEXT:    v_readlane_b32 s33, v23, 2
-; GFX10_3-NEXT:    v_readlane_b32 s31, v23, 1
-; GFX10_3-NEXT:    v_readlane_b32 s30, v23, 0
+; GFX10_3-NEXT:    v_readlane_b32 s30, v23, 15
+; GFX10_3-NEXT:    v_readlane_b32 s31, v23, 16
+; GFX10_3-NEXT:    v_readlane_b32 s55, v23, 14
+; GFX10_3-NEXT:    v_readlane_b32 s54, v23, 13
+; GFX10_3-NEXT:    v_readlane_b32 s53, v23, 12
+; GFX10_3-NEXT:    v_readlane_b32 s52, v23, 11
+; GFX10_3-NEXT:    v_readlane_b32 s51, v23, 10
+; GFX10_3-NEXT:    v_readlane_b32 s50, v23, 9
+; GFX10_3-NEXT:    v_readlane_b32 s49, v23, 8
+; GFX10_3-NEXT:    v_readlane_b32 s48, v23, 7
+; GFX10_3-NEXT:    v_readlane_b32 s39, v23, 6
+; GFX10_3-NEXT:    v_readlane_b32 s38, v23, 5
+; GFX10_3-NEXT:    v_readlane_b32 s37, v23, 4
+; GFX10_3-NEXT:    v_readlane_b32 s36, v23, 3
+; GFX10_3-NEXT:    v_readlane_b32 s35, v23, 2
+; GFX10_3-NEXT:    v_readlane_b32 s34, v23, 1
+; GFX10_3-NEXT:    v_readlane_b32 s33, v23, 0
 ; GFX10_3-NEXT:    s_xor_saveexec_b32 s4, -1
 ; GFX10_3-NEXT:    s_add_i32 s5, s32, 0x80880
 ; GFX10_3-NEXT:    buffer_load_dword v23, off, s[0:3], s5 ; 4-byte Folded Reload
@@ -434,30 +434,30 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
 ; GFX11-NEXT:    s_add_i32 s1, s32, 0x4044
 ; GFX11-NEXT:    scratch_store_b32 off, v23, s1 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
-; GFX11-NEXT:    v_writelane_b32 v23, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v23, s33, 0
 ; GFX11-NEXT:    s_add_i32 s0, s32, 64
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-NEXT:    s_and_b32 s0, 0, exec_lo
-; GFX11-NEXT:    v_writelane_b32 v23, s31, 1
+; GFX11-NEXT:    v_writelane_b32 v23, s34, 1
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ; use alloca0 v0
 ; GFX11-NEXT:    ;;#ASMEND
-; GFX11-NEXT:    v_writelane_b32 v23, s33, 2
-; GFX11-NEXT:    v_writelane_b32 v23, s34, 3
-; GFX11-NEXT:    v_writelane_b32 v23, s35, 4
-; GFX11-NEXT:    v_writelane_b32 v23, s36, 5
-; GFX11-NEXT:    v_writelane_b32 v23, s37, 6
-; GFX11-NEXT:    v_writelane_b32 v23, s38, 7
-; GFX11-NEXT:    v_writelane_b32 v23, s39, 8
-; GFX11-NEXT:    v_writelane_b32 v23, s48, 9
-; GFX11-NEXT:    v_writelane_b32 v23, s49, 10
-; GFX11-NEXT:    v_writelane_b32 v23, s50, 11
-; GFX11-NEXT:    v_writelane_b32 v23, s51, 12
-; GFX11-NEXT:    v_writelane_b32 v23, s52, 13
-; GFX11-NEXT:    v_writelane_b32 v23, s53, 14
-; GFX11-NEXT:    v_writelane_b32 v23, s54, 15
-; GFX11-NEXT:    v_writelane_b32 v23, s55, 16
+; GFX11-NEXT:    v_writelane_b32 v23, s35, 2
+; GFX11-NEXT:    v_writelane_b32 v23, s36, 3
+; GFX11-NEXT:    v_writelane_b32 v23, s37, 4
+; GFX11-NEXT:    v_writelane_b32 v23, s38, 5
+; GFX11-NEXT:    v_writelane_b32 v23, s39, 6
+; GFX11-NEXT:    v_writelane_b32 v23, s48, 7
+; GFX11-NEXT:    v_writelane_b32 v23, s49, 8
+; GFX11-NEXT:    v_writelane_b32 v23, s50, 9
+; GFX11-NEXT:    v_writelane_b32 v23, s51, 10
+; GFX11-NEXT:    v_writelane_b32 v23, s52, 11
+; GFX11-NEXT:    v_writelane_b32 v23, s53, 12
+; GFX11-NEXT:    v_writelane_b32 v23, s54, 13
+; GFX11-NEXT:    v_writelane_b32 v23, s55, 14
+; GFX11-NEXT:    v_writelane_b32 v23, s30, 15
+; GFX11-NEXT:    v_writelane_b32 v23, s31, 16
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc
 ; GFX11-NEXT:    ;;#ASMEND
@@ -470,23 +470,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s54, scc
 ; GFX11-NEXT:    ;;#ASMEND
-; GFX11-NEXT:    v_readlane_b32 s55, v23, 16
-; GFX11-NEXT:    v_readlane_b32 s54, v23, 15
-; GFX11-NEXT:    v_readlane_b32 s53, v23, 14
-; GFX11-NEXT:    v_readlane_b32 s52, v23, 13
-; GFX11-NEXT:    v_readlane_b32 s51, v23, 12
-; GFX11-NEXT:    v_readlane_b32 s50, v23, 11
-; GFX11-NEXT:    v_readlane_b32 s49, v23, 10
-; GFX11-NEXT:    v_readlane_b32 s48, v23, 9
-; GFX11-NEXT:    v_readlane_b32 s39, v23, 8
-; GFX11-NEXT:    v_readlane_b32 s38, v23, 7
-; GFX11-NEXT:    v_readlane_b32 s37, v23, 6
-; GFX11-NEXT:    v_readlane_b32 s36, v23, 5
-; GFX11-NEXT:    v_readlane_b32 s35, v23, 4
-; GFX11-NEXT:    v_readlane_b32 s34, v23, 3
-; GFX11-NEXT:    v_readlane_b32 s33, v23, 2
-; GFX11-NEXT:    v_readlane_b32 s31, v23, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v23, 0
+; GFX11-NEXT:    v_readlane_b32 s30, v23, 15
+; GFX11-NEXT:    v_readlane_b32 s31, v23, 16
+; GFX11-NEXT:    v_readlane_b32 s55, v23, 14
+; GFX11-NEXT:    v_readlane_b32 s54, v23, 13
+; GFX11-NEXT:    v_readlane_b32 s53, v23, 12
+; GFX11-NEXT:    v_readlane_b32 s52, v23, 11
+; GFX11-NEXT:    v_readlane_b32 s51, v23, 10
+; GFX11-NEXT:    v_readlane_b32 s50, v23, 9
+; GFX11-NEXT:    v_readlane_b32 s49, v23, 8
+; GFX11-NEXT:    v_readlane_b32 s48, v23, 7
+; GFX11-NEXT:    v_readlane_b32 s39, v23, 6
+; GFX11-NEXT:    v_readlane_b32 s38, v23, 5
+; GFX11-NEXT:    v_readlane_b32 s37, v23, 4
+; GFX11-NEXT:    v_readlane_b32 s36, v23, 3
+; GFX11-NEXT:    v_readlane_b32 s35, v23, 2
+; GFX11-NEXT:    v_readlane_b32 s34, v23, 1
+; GFX11-NEXT:    v_readlane_b32 s33, v23, 0
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX11-NEXT:    s_add_i32 s1, s32, 0x4044
 ; GFX11-NEXT:    scratch_load_b32 v23, off, s1 ; 4-byte Folded Reload
@@ -505,28 +505,28 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
 ; GFX12-NEXT:    scratch_store_b32 off, v23, s32 offset:16388 ; 4-byte Folded Spill
 ; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12-NEXT:    s_mov_b32 exec_lo, s0
-; GFX12-NEXT:    v_writelane_b32 v23, s30, 0
+; GFX12-NEXT:    v_writelane_b32 v23, s33, 0
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s32
 ; GFX12-NEXT:    s_and_b32 s0, 0, exec_lo
 ; GFX12-NEXT:    ;;#ASMSTART
 ; GFX12-NEXT:    ; use alloca0 v0
 ; GFX12-NEXT:    ;;#ASMEND
-; GFX12-NEXT:    v_writelane_b32 v23, s31, 1
-; GFX12-NEXT:    v_writelane_b32 v23, s33, 2
-; GFX12-NEXT:    v_writelane_b32 v23, s34, 3
-; GFX12-NEXT:    v_writelane_b32 v23, s35, 4
-; GFX12-NEXT:    v_writelane_b32 v23, s36, 5
-; GFX12-NEXT:    v_writelane_b32 v23, s37, 6
-; GFX12-NEXT:    v_writelane_b32 v23, s38, 7
-; GFX12-NEXT:    v_writelane_b32 v23, s39, 8
-; GFX12-NEXT:    v_writelane_b32 v23, s48, 9
-; GFX12-NEXT:    v_writelane_b32 v23, s49, 10
-; GFX12-NEXT:    v_writelane_b32 v23, s50, 11
-; GFX12-NEXT:    v_writelane_b32 v23, s51, 12
-; GFX12-NEXT:    v_writelane_b32 v23, s52, 13
-; GFX12-NEXT:    v_writelane_b32 v23, s53, 14
-; GFX12-NEXT:    v_writelane_b32 v23, s54, 15
-; GFX12-NEXT:    v_writelane_b32 v23, s55, 16
+; GFX12-NEXT:    v_writelane_b32 v23, s34, 1
+; GFX12-NEXT:    v_writelane_b32 v23, s35, 2
+; GFX12-NEXT:    v_writelane_b32 v23, s36, 3
+; GFX12-NEXT:    v_writelane_b32 v23, s37, 4
+; GFX12-NEXT:    v_writelane_b32 v23, s38, 5
+; GFX12-NEXT:    v_writelane_b32 v23, s39, 6
+; GFX12-NEXT:    v_writelane_b32 v23, s48, 7
+; GFX12-NEXT:    v_writelane_b32 v23, s49, 8
+; GFX12-NEXT:    v_writelane_b32 v23, s50, 9
+; GFX12-NEXT:    v_writelane_b32 v23, s51, 10
+; GFX12-NEXT:    v_writelane_b32 v23, s52, 11
+; GFX12-NEXT:    v_writelane_b32 v23, s53, 12
+; GFX12-NEXT:    v_writelane_b32 v23, s54, 13
+; GFX12-NEXT:    v_writelane_b32 v23, s55, 14
+; GFX12-NEXT:    v_writelane_b32 v23, s30, 15
+; GFX12-NEXT:    v_writelane_b32 v23, s31, 16
 ; GFX12-NEXT:    ;;#ASMSTART
 ; GFX12-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc
 ; GFX12-NEXT:    ;;#ASMEND
@@ -540,23 +540,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
 ; GFX12-NEXT:    ;;#ASMSTART
 ; GFX12-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s54, scc
 ; GFX12-NEXT:    ;;#ASMEND
-; GFX12-NEXT:    v_readlane_b32 s55, v23, 16
-; GFX12-NEXT:    v_readlane_b32 s54, v23, 15
-; GFX12-NEXT:    v_readlane_b32 s53, v23, 14
-; GFX12-NEXT:    v_readlane_b32 s52, v23, 13
-; GFX12-NEXT:    v_readlane_b32 s51, v23, 12
-; GFX12-NEXT:    v_readlane_b32 s50, v23, 11
-; GFX12-NEXT:    v_readlane_b32 s49, v23, 10
-; GFX12-NEXT:    v_readlane_b32 s48, v23, 9
-; GFX12-NEXT:    v_readlane_b32 s39, v23, 8
-; GFX12-NEXT:    v_readlane_b32 s38, v23, 7
-; GFX12-NEXT:    v_readlane_b32 s37, v23, 6
-; GFX12-NEXT:    v_readlane_b32 s36, v23, 5
-; GFX12-NEXT:    v_readlane_b32 s35, v23, 4
-; GFX12-NEXT:    v_readlane_b32 s34, v23, 3
-; GFX12-NEXT:    v_readlane_b32 s33, v23, 2
-; GFX12-NEXT:    v_readlane_b32 s31, v23, 1
-; GFX12-NEXT:    v_readlane_b32 s30, v23, 0
+; GFX12-NEXT:    v_readlane_b32 s30, v23, 15
+; GFX12-NEXT:    v_readlane_b32 s31, v23, 16
+; GFX12-NEXT:    v_readlane_b32 s55, v23, 14
+; GFX12-NEXT:    v_readlane_b32 s54, v23, 13
+; GFX12-NEXT:    v_readlane_b32 s53, v23, 12
+; GFX12-NEXT:    v_readlane_b32 s52, v23, 11
+; GFX12-NEXT:    v_readlane_b32 s51, v23, 10
+; GFX12-NEXT:    v_readlane_b32 s50, v23, 9
+; GFX12-NEXT:    v_readlane_b32 s49, v23, 8
+; GFX12-NEXT:    v_readlane_b32 s48, v23, 7
+; GFX12-NEXT:    v_readlane_b32 s39, v23, 6
+; GFX12-NEXT:    v_readlane_b32 s38, v23, 5
+; GFX12-NEXT:    v_readlane_b32 s37, v23, 4
+; GFX12-NEXT:    v_readlane_b32 s36, v23, 3
+; GFX12-NEXT:    v_readlane_b32 s35, v23, 2
+; GFX12-NEXT:    v_readlane_b32 s34, v23, 1
+; GFX12-NEXT:    v_readlane_b32 s33, v23, 0
 ; GFX12-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX12-NEXT:    scratch_load_b32 v23, off, s32 offset:16388 ; 4-byte Folded Reload
 ; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
@@ -613,24 +613,24 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
 ; GFX7-NEXT:    s_add_i32 s6, s32, 0x100400
 ; GFX7-NEXT:    buffer_store_dword v21, off, s[0:3], s6 ; 4-byte Folded Spill
 ; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX7-NEXT:    v_writelane_b32 v21, s30, 0
-; GFX7-NEXT:    v_writelane_b32 v21, s31, 1
-; GFX7-NEXT:    v_writelane_b32 v21, s33, 2
-; GFX7-NEXT:    v_writelane_b32 v21, s34, 3
-; GFX7-NEXT:    v_writelane_b32 v21, s35, 4
-; GFX7-NEXT:    v_writelane_b32 v21, s36, 5
-; GFX7-NEXT:    v_writelane_b32 v21, s37, 6
-; GFX7-NEXT:    v_writelane_b32 v21, s38, 7
-; GFX7-NEXT:    v_writelane_b32 v21, s39, 8
-; GFX7-NEXT:    v_writelane_b32 v21, s48, 9
-; GFX7-NEXT:    v_writelane_b32 v21, s49, 10
-; GFX7-NEXT:    v_writelane_b32 v21, s50, 11
-; GFX7-NEXT:    v_writelane_b32 v21, s51, 12
-; GFX7-NEXT:    v_writelane_b32 v21, s52, 13
-; GFX7-NEXT:    v_writelane_b32 v21, s53, 14
-; GFX7-NEXT:    v_writelane_b32 v21, s54, 15
+; GFX7-NEXT:    v_writelane_b32 v21, s33, 0
+; GFX7-NEXT:    v_writelane_b32 v21, s34, 1
+; GFX7-NEXT:    v_writelane_b32 v21, s35, 2
+; GFX7-NEXT:    v_writelane_b32 v21, s36, 3
+; GFX7-NEXT:    v_writelane_b32 v21, s37, 4
+; GFX7-NEXT:    v_writelane_b32 v21, s38, 5
+; GFX7-NEXT:    v_writelane_b32 v21, s39, 6
+; GFX7-NEXT:    v_writelane_b32 v21, s48, 7
+; GFX7-NEXT:    v_writelane_b32 v21, s49, 8
+; GFX7-NEXT:    v_writelane_b32 v21, s50, 9
+; GFX7-NEXT:    v_writelane_b32 v21, s51, 10
+; GFX7-NEXT:    v_writelane_b32 v21, s52, 11
+; GFX7-NEXT:    v_writelane_b32 v21, s53, 12
+; GFX7-NEXT:    v_writelane_b32 v21, s54, 13
+; GFX7-NEXT:    v_writelane_b32 v21, s55, 14
+; GFX7-NEXT:    v_writelane_b32 v21, s30, 15
 ; GFX7-NEXT:    s_and_b64 s[4:5], 0, exec
-; GFX7-NEXT:    v_writelane_b32 v21, s55, 16
+; GFX7-NEXT:    v_writelane_b32 v21, s31, 16
 ; GFX7-NEXT:    ;;#ASMSTART
 ; GFX7-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
 ; GFX7-NEXT:    ;;#ASMEND
@@ -640,23 +640,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
 ; GFX7-NEXT:    ;;#ASMSTART
 ; GFX7-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s54, scc
 ; GFX7-NEXT:    ;;#ASMEND
-; GFX7-NEXT:    v_readlane_b32 s55, v21, 16
-; GFX7-NEXT:    v_readlane_b32 s54, v21, 15
-; GFX7-NEXT:    v_readlane_b32 s53, v21, 14
-; GFX7-NEXT:    v_readlane_b32 s52, v21, 13
-; GFX7-NEXT:    v_readlane_b32 s51, v21, 12
-; GFX7-NEXT:    v_readlane_b32 s50, v21, 11
-; GFX7-NEXT:    v_readlane_b32 s49, v21, 10
-; GFX7-NEXT:    v_readlane_b32 s48, v21, 9
-; GFX7-NEXT:    v_readlane_b32 s39, v21, 8
-; GFX7-NEXT:    v_readlane_b32 s38, v21, 7
-; GFX7-NEXT:    v_readlane_b32 s37, v21, 6
-; GFX7-NEXT:    v_readlane_b32 s36, v21, 5
-; GFX7-NEXT:    v_readlane_b32 s35, v21, 4
-; GFX7-NEXT:    v_readlane_b32 s34, v21, 3
-; GFX7-NEXT:    v_readlane_b32 s33, v21, 2
-; GFX7-NEXT:    v_readlane_b32 s31, v21, 1
-; GFX7-NEXT:    v_readlane_b32 s30, v21, 0
+; GFX7-NEXT:    v_readlane_b32 s30, v21, 15
+; GFX7-NEXT:    v_readlane_b32 s31, v21, 16
+; GFX7-NEXT:    v_readlane_b32 s55, v21, 14
+; GFX7-NEXT:    v_readlane_b32 s54, v21, 13
+; GFX7-NEXT:    v_readlane_b32 s53, v21, 12
+; GFX7-NEXT:    v_readlane_b32 s52, v21, 11
+; GFX7-NEXT:    v_readlane_b32 s51, v21, 10
+; GFX7-NEXT:    v_readlane_b32 s50, v21, 9
+; GFX7-NEXT:    v_readlane_b32 s49, v21, 8
+; GFX7-NEXT:    v_readlane_b32 s48, v21, 7
+; GFX7-NEXT:    v_readlane_b32 s39, v21, 6
+; GFX7-NEXT:    v_readlane_b32 s38, v21, 5
+; GFX7-NEXT:    v_readlane_b32 s37, v21, 4
+; GFX7-NEXT:    v_readlane_b32 s36, v21, 3
+; GFX7-NEXT:    v_readlane_b32 s35, v21, 2
+; GFX7-NEXT:    v_readlane_b32 s34, v21, 1
+; GFX7-NEXT:    v_readlane_b32 s33, v21, 0
 ; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX7-NEXT:    s_add_i32 s6, s32, 0x100400
 ; GFX7-NEXT:    buffer_load_dword v21, off, s[0:3], s6 ; 4-byte Folded Reload
@@ -671,24 +671,24 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
 ; GFX8-NEXT:    s_add_i32 s6, s32, 0x100400
 ; GFX8-NEXT:    buffer_store_dword v21, off, s[0:3], s6 ; 4-byte Folded Spill
 ; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX8-NEXT:    v_writelane_b32 v21, s30, 0
-; GFX8-NEXT:    v_writelane_b32 v21, s31, 1
-; GFX8-NEXT:    v_writelane_b32 v21, s33, 2
-; GFX8-NEXT:    v_writelane_b32 v21, s34, 3
-; GFX8-NEXT:    v_writelane_b32 v21, s35, 4
-; GFX8-NEXT:    v_writelane_b32 v21, s36, 5
-; GFX8-NEXT:    v_writelane_b32 v21, s37, 6
-; GFX8-NEXT:    v_writelane_b32 v21, s38, 7
-; GFX8-NEXT:    v_writelane_b32 v21, s39, 8
-; GFX8-NEXT:    v_writelane_b32 v21, s48, 9
-; GFX8-NEXT:    v_writelane_b32 v21, s49, 10
-; GFX8-NEXT:    v_writelane_b32 v21, s50, 11
-; GFX8-NEXT:    v_writelane_b32 v21, s51, 12
-; GFX8-NEXT:    v_writelane_b32 v21, s52, 13
-; GFX8-NEXT:    v_writelane_b32 v21, s53, 14
-; GFX8-NEXT:    v_writelane_b32 v21, s54, 15
+; GFX8-NEXT:    v_writelane_b32 v21, s33, 0
+; GFX8-NEXT:    v_writelane_b32 v21, s34, 1
+; GFX8-NEXT:    v_writelane_b32 v21, s35, 2
+; GFX8-NEXT:    v_writelane_b32 v21, s36, 3
+; GFX8-NEXT:    v_writelane_b32 v21, s37, 4
+; GFX8-NEXT:    v_writelane_b32 v21, s38, 5
+; GFX8-NEXT:    v_writelane_b32 v21, s39, 6
+; GFX8-NEXT:    v_writelane_b32 v21, s48, 7
+; GFX8-NEXT:    v_writelane_b32 v21, s49, 8
+; GFX8-NEXT:    v_writelane_b32 v21, s50, 9
+; GFX8-NEXT:    v_writelane_b32 v21, s51, 10
+; GFX8-NEXT:    v_writelane_b32 v21, s52, 11
+; GFX8-NEXT:    v_writelane_b32 v21, s53, 12
+; GFX8-NEXT:    v_writelane_b32 v21, s54, 13
+; GFX8-NEXT:    v_writelane_b32 v21, s55, 14
+; GFX8-NEXT:    v_writelane_b32 v21, s30, 15
 ; GFX8-NEXT:    s_and_b64 s[4:5], 0, exec
-; GFX8-NEXT:    v_writelane_b32 v21, s55, 16
+; GFX8-NEXT:    v_writelane_b32 v21, s31, 16
 ; GFX8-NEXT:    ;;#ASMSTART
 ; GFX8-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
 ; GFX8-NEXT:    ;;#ASMEND
@@ -699,23 +699,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
 ; GFX8-NEXT:    ;;#ASMSTART
 ; GFX8-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s54, scc
 ; GFX8-NEXT:    ;;#ASMEND
-; GFX8-NEXT:    v_readlane_b32 s55, v21, 16
-; GFX8-NEXT:    v_readlane_b32 s54, v21, 15
-; GFX8-NEXT:    v_readlane_b32 s53, v21, 14
-; GFX8-NEXT:    v_readlane_b32 s52, v21, 13
-; GFX8-NEXT:    v_readlane_b32 s51, v21, 12
-; GFX8-NEXT:    v_readlane_b32 s50, v21, 11
-; GFX8-NEXT:    v_readlane_b32 s49, v21, 10
-; GFX8-NEXT:    v_readlane_b32 s48, v21, 9
-; GFX8-NEXT:    v_readlane_b32 s39, v21, 8
-; GFX8-NEXT:    v_readlane_b32 s38, v21, 7
-; GFX8-NEXT:    v_readlane_b32 s37, v21, 6
-; GFX8-NEXT:    v_readlane_b32 s36, v21, 5
-; GFX8-NEXT:    v_readlane_b32 s35, v21, 4
-; GFX8-NEXT:    v_readlane_b32 s34, v21, 3
-; GFX8-NEXT:    v_readlane_b32 s33, v21, 2
-; GFX8-NEXT:    v_readlane_b32 s31, v21, 1
-; GFX8-NEXT:    v_readlane_b32 s30, v21, 0
+; GFX8-NEXT:    v_readlane_b32 s30, v21, 15
+; GFX8-NEXT:    v_readlane_b32 s31, v21, 16
+; GFX8-NEXT:    v_readlane_b32 s55, v21, 14
+; GFX8-NEXT:    v_readlane_b32 s54, v21, 13
+; GFX8-NEXT:    v_readlane_b32 s53, v21, 12
+; GFX8-NEXT:    v_readlane_b32 s52, v21, 11
+; GFX8-NEXT:    v_readlane_b32 s51, v21, 10
+; GFX8-NEXT:    v_readlane_b32 s50, v21, 9
+; GFX8-NEXT:    v_readlane_b32 s49, v21, 8
+; GFX8-NEXT:    v_readlane_b32 s48, v21, 7
+; GFX8-NEXT:    v_readlane_b32 s39, v21, 6
+; GFX8-NEXT:    v_readlane_b32 s38, v21, 5
+; GFX8-NEXT:    v_readlane_b32 s37, v21, 4
+; GFX8-NEXT:    v_readlane_b32 s36, v21, 3
+; GFX8-NEXT:    v_readlane_b32 s35, v21, 2
+; GFX8-NEXT:    v_readlane_b32 s34, v21, 1
+; GFX8-NEXT:    v_readlane_b32 s33, v21, 0
 ; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX8-NEXT:    s_add_i32 s6, s32, 0x100400
 ; GFX8-NEXT:    buffer_load_dword v21, off, s[0:3], s6 ; 4-byte Folded Reload
@@ -730,24 +730,24 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
 ; GFX900-NEXT:    s_add_i32 s6, s32, 0x100400
 ; GFX900-NEXT:    buffer_store_dword v21, off, s[0:3], s6 ; 4-byte Folded Spill
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX900-NEXT:    v_writelane_b32 v21, s30, 0
-; GFX900-NEXT:    v_writelane_b32 v21, s31, 1
-; GFX900-NEXT:    v_writelane_b32 v21, s33, 2
-; GFX900-NEXT:    v_writelane_b32 v21, s34, 3
-; GFX900-NEXT:    v_writelane_b32 v21, s35, 4
-; GFX900-NEXT:    v_writelane_b32 v21, s36, 5
-; GFX900-NEXT:    v_writelane_b32 v21, s37, 6
-; GFX900-NEXT:    v_writelane_b32 v21, s38, 7
-; GFX900-NEXT:    v_writelane_b32 v21, s39, 8
-; GFX900-NEXT:    v_writelane_b32 v21, s48, 9
-; GFX900-NEXT:    v_writelane_b32 v21, s49, 10
-; GFX900-NEXT:    v_writelane_b32 v21, s50, 11
-; GFX900-NEXT:    v_writelane_b32 v21, s51, 12
-; GFX900-NEXT:    v_writelane_b32 v21, s52, 13
-; GFX900-NEXT:    v_writelane_b32 v21, s53, 14
-; GFX900-NEXT:    v_writelane_b32 v21, s54, 15
+; GFX900-NEXT:    v_writelane_b32 v21, s33, 0
+; GFX900-NEXT:    v_writelane_b32 v21, s34, 1
+; GFX900-NEXT:    v_writelane_b32 v21, s35, 2
+; GFX900-NEXT:    v_writelane_b32 v21, s36, 3
+; GFX900-NEXT:    v_writelane_b32 v21, s37, 4
+; GFX900-NEXT:    v_writelane_b32 v21, s38, 5
+; GFX900-NEXT:    v_writelane_b32 v21, s39, 6
+; GFX900-NEXT:    v_writelane_b32 v21, s48, 7
+; GFX900-NEXT:    v_writelane_b32 v21, s49, 8
+; GFX900-NEXT:    v_writelane_b32 v21, s50, 9
+; GFX900-NEXT:    v_writelane_b32 v21, s51, 10
+; GFX900-NEXT:    v_writelane_b32 v21, s52, 11
+; GFX900-NEXT:    v_writelane_b32 v21, s53, 12
+; GFX900-NEXT:    v_writelane_b32 v21, s54, 13
+; GFX900-NEXT:    v_writelane_b32 v21, s55, 14
+; GFX900-NEXT:    v_writelane_b32 v21, s30, 15
 ; GFX900-NEXT:    s_and_b64 s[4:5], 0, exec
-; GFX900-NEXT:    v_writelane_b32 v21, s55, 16
+; GFX900-NEXT:    v_writelane_b32 v21, s31, 16
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
 ; GFX900-NEXT:    ;;#ASMEND
@@ -758,23 +758,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s54, scc
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    v_readlane_b32 s55, v21, 16
-; GFX900-NEXT:    v_readlane_b32 s54, v21, 15
-; GFX900-NEXT:    v_readlane_b32 s53, v21, 14
-; GFX900-NEXT:    v_readlane_b32 s52, v21, 13
-; GFX900-NEXT:    v_readlane_b32 s51, v21, 12
-; GFX900-NEXT:    v_readlane_b32 s50, v21, 11
-; GFX900-NEXT:    v_readlane_b32 s49, v21, 10
-; GFX900-NEXT:    v_readlane_b32 s48, v21, 9
-; GFX900-NEXT:    v_readlane_b32 s39, v21, 8
-; GFX900-NEXT:    v_readlane_b32 s38, v21, 7
-; GFX900-NEXT:    v_readlane_b32 s37, v21, 6
-; GFX900-NEXT:    v_readlane_b32 s36, v21, 5
-; GFX900-NEXT:    v_readlane_b32 s35, v21, 4
-; GFX900-NEXT:    v_readlane_b32 s34, v21, 3
-; GFX900-NEXT:    v_readlane_b32 s33, v21, 2
-; GFX900-NEXT:    v_readlane_b32 s31, v21, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v21, 0
+; GFX900-NEXT:    v_readlane_b32 s30, v21, 15
+; GFX900-NEXT:    v_readlane_b32 s31, v21, 16
+; GFX900-NEXT:    v_readlane_b32 s55, v21, 14
+; GFX900-NEXT:    v_readlane_b32 s54, v21, 13
+; GFX900-NEXT:    v_readlane_b32 s53, v21, 12
+; GFX900-NEXT:    v_readlane_b32 s52, v21, 11
+; GFX900-NEXT:    v_readlane_b32 s51, v21, 10
+; GFX900-NEXT:    v_readlane_b32 s50, v21, 9
+; GFX900-NEXT:    v_readlane_b32 s49, v21, 8
+; GFX900-NEXT:    v_readlane_b32 s48, v21, 7
+; GFX900-NEXT:    v_readlane_b32 s39, v21, 6
+; GFX900-NEXT:    v_readlane_b32 s38, v21, 5
+; GFX900-NEXT:    v_readlane_b32 s37, v21, 4
+; GFX900-NEXT:    v_readlane_b32 s36, v21, 3
+; GFX900-NEXT:    v_readlane_b32 s35, v21, 2
+; GFX900-NEXT:    v_readlane_b32 s34, v21, 1
+; GFX900-NEXT:    v_readlane_b32 s33, v21, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    s_add_i32 s6, s32, 0x100400
 ; GFX900-NEXT:    buffer_load_dword v21, off, s[0:3], s6 ; 4-byte Folded Reload
@@ -789,24 +789,25 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
 ; GFX942-NEXT:    s_add_i32 s2, s32, 0x4010
 ; GFX942-NEXT:    scratch_store_dword off, v21, s2 ; 4-byte Folded Spill
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX942-NEXT:    v_writelane_b32 v21, s30, 0
-; GFX942-NEXT:    v_writelane_b32 v21, s31, 1
-; GFX942-NEXT:    v_writelane_b32 v21, s33, 2
-; GFX942-NEXT:    v_writelane_b32 v21, s34, 3
-; GFX942-NEXT:    v_writelane_b32 v21, s35, 4
-; GFX942-NEXT:    v_writelane_b32 v21, s36, 5
-; GFX942-NEXT:    v_writelane_b32 v21, s37, 6
-; GFX942-NEXT:    v_writelane_b32 v21, s38, 7
-; GFX942-NEXT:    v_writelane_b32 v21, s39, 8
-; GFX942-NEXT:    v_writelane_b32 v21, s48, 9
-; GFX942-NEXT:    v_writelane_b32 v21, s49, 10
-; GFX942-NEXT:    v_writelane_b32 v21, s50, 11
-; GFX942-NEXT:    v_writelane_b32 v21, s51, 12
-; GFX942-NEXT:    v_writelane_b32 v21, s52, 13
-; GFX942-NEXT:    v_writelane_b32 v21, s53, 14
-; GFX942-NEXT:    v_writelane_b32 v21, s54, 15
+; GFX942-NEXT:    v_writelane_b32 v21, s33, 0
+; GFX942-NEXT:    v_writelane_b32 v21, s34, 1
+; GFX942-NEXT:    v_writelane_b32 v21, s35, 2
+; GFX942-NEXT:    v_writelane_b32 v21, s36, 3
+; GFX942-NEXT:    v_writelane_b32 v21, s37, 4
+; GFX942-NEXT:    v_writelane_b32 v21, s38, 5
+; GFX942-NEXT:    v_writelane_b32 v21, s39, 6
+; GFX942-NEXT:    v_writelane_b32 v21, s48, 7
+; GFX942-NEXT:    v_writelane_b32 v21, s49, 8
+; GFX942-NEXT:    v_writelane_b32 v21, s50, 9
+; GFX942-NEXT:    v_writelane_b32 v21, s51, 10
+; GFX942-NEXT:    v_writelane_b32 v21, s52, 11
+; GFX942-NEXT:    v_writelane_b32 v21, s53, 12
+; GFX942-NEXT:    v_writelane_b32 v21, s54, 13
+; GFX942-NEXT:    v_writelane_b32 v21, s55, 14
+; GFX942-NEXT:    v_writelane_b32 v21, s30, 15
 ; GFX942-NEXT:    s_and_b64 s[60:61], 0, exec
-; GFX942-NEXT:    v_writelane_b32 v21, s55, 16
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_writelane_b32 v21, s31, 16
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
 ; GFX942-NEXT:    ;;#ASMEND
@@ -818,23 +819,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s54, scc
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_readlane_b32 s55, v21, 16
-; GFX942-NEXT:    v_readlane_b32 s54, v21, 15
-; GFX942-NEXT:    v_readlane_b32 s53, v21, 14
-; GFX942-NEXT:    v_readlane_b32 s52, v21, 13
-; GFX942-NEXT:    v_readlane_b32 s51, v21, 12
-; GFX942-NEXT:    v_readlane_b32 s50, v21, 11
-; GFX942-NEXT:    v_readlane_b32 s49, v21, 10
-; GFX942-NEXT:    v_readlane_b32 s48, v21, 9
-; GFX942-NEXT:    v_readlane_b32 s39, v21, 8
-; GFX942-NEXT:    v_readlane_b32 s38, v21, 7
-; GFX942-NEXT:    v_readlane_b32 s37, v21, 6
-; GFX942-NEXT:    v_readlane_b32 s36, v21, 5
-; GFX942-NEXT:    v_readlane_b32 s35, v21, 4
-; GFX942-NEXT:    v_readlane_b32 s34, v21, 3
-; GFX942-NEXT:    v_readlane_b32 s33, v21, 2
-; GFX942-NEXT:    v_readlane_b32 s31, v21, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v21, 0
+; GFX942-NEXT:    v_readlane_b32 s30, v21, 15
+; GFX942-NEXT:    v_readlane_b32 s31, v21, 16
+; GFX942-NEXT:    v_readlane_b32 s55, v21, 14
+; GFX942-NEXT:    v_readlane_b32 s54, v21, 13
+; GFX942-NEXT:    v_readlane_b32 s53, v21, 12
+; GFX942-NEXT:    v_readlane_b32 s52, v21, 11
+; GFX942-NEXT:    v_readlane_b32 s51, v21, 10
+; GFX942-NEXT:    v_readlane_b32 s50, v21, 9
+; GFX942-NEXT:    v_readlane_b32 s49, v21, 8
+; GFX942-NEXT:    v_readlane_b32 s48, v21, 7
+; GFX942-NEXT:    v_readlane_b32 s39, v21, 6
+; GFX942-NEXT:    v_readlane_b32 s38, v21, 5
+; GFX942-NEXT:    v_readlane_b32 s37, v21, 4
+; GFX942-NEXT:    v_readlane_b32 s36, v21, 3
+; GFX942-NEXT:    v_readlane_b32 s35, v21, 2
+; GFX942-NEXT:    v_readlane_b32 s34, v21, 1
+; GFX942-NEXT:    v_readlane_b32 s33, v21, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    s_add_i32 s2, s32, 0x4010
 ; GFX942-NEXT:    scratch_load_dword v21, off, s2 ; 4-byte Folded Reload
@@ -850,24 +851,24 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
 ; GFX10_1-NEXT:    buffer_store_dword v21, off, s[0:3], s5 ; 4-byte Folded Spill
 ; GFX10_1-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
 ; GFX10_1-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10_1-NEXT:    v_writelane_b32 v21, s30, 0
+; GFX10_1-NEXT:    v_writelane_b32 v21, s33, 0
 ; GFX10_1-NEXT:    s_and_b32 s59, 0, exec_lo
-; GFX10_1-NEXT:    v_writelane_b32 v21, s31, 1
-; GFX10_1-NEXT:    v_writelane_b32 v21, s33, 2
-; GFX10_1-NEXT:    v_writelane_b32 v21, s34, 3
-; GFX10_1-NEXT:    v_writelane_b32 v21, s35, 4
-; GFX10_1-NEXT:    v_writelane_b32 v21, s36, 5
-; GFX10_1-NEXT:    v_writelane_b32 v21, s37, 6
-; GFX10_1-NEXT:    v_writelane_b32 v21, s38, 7
-; GFX10_1-NEXT:    v_writelane_b32 v21, s39, 8
-; GFX10_1-NEXT:    v_writelane_b32 v21, s48, 9
-; GFX10_1-NEXT:    v_writelane_b32 v21, s49, 10
-; GFX10_1-NEXT:    v_writelane_b32 v21, s50, 11
-; GFX10_1-NEXT:    v_writelane_b32 v21, s51, 12
-; GFX10_1-NEXT:    v_writelane_b32 v21, s52, 13
-; GFX10_1-NEXT:    v_writelane_b32 v21, s53, 14
-; GFX10_1-NEXT:    v_writelane_b32 v21, s54, 15
-; GFX10_1-NEXT:    v_writelane_b32 v21, s55, 16
+; GFX10_1-NEXT:    v_writelane_b32 v21, s34, 1
+; GFX10_1-NEXT:    v_writelane_b32 v21, s35, 2
+; GFX10_1-NEXT:    v_writelane_b32 v21, s36, 3
+; GFX10_1-NEXT:    v_writelane_b32 v21, s37, 4
+; GFX10_1-NEXT:    v_writelane_b32 v21, s38, 5
+; GFX10_1-NEXT:    v_writelane_b32 v21, s39, 6
+; GFX10_1-NEXT:    v_writelane_b32 v21, s48, 7
+; GFX10_1-NEXT:    v_writelane_b32 v21, s49, 8
+; GFX10_1-NEXT:    v_writelane_b32 v21, s50, 9
+; GFX10_1-NEXT:    v_writelane_b32 v21, s51, 10
+; GFX10_1-NEXT:    v_writelane_b32 v21, s52, 11
+; GFX10_1-NEXT:    v_writelane_b32 v21, s53, 12
+; GFX10_1-NEXT:    v_writelane_b32 v21, s54, 13
+; GFX10_1-NEXT:    v_writelane_b32 v21, s55, 14
+; GFX10_1-NEXT:    v_writelane_b32 v21, s30, 15
+; GFX10_1-NEXT:    v_writelane_b32 v21, s31, 16
 ; GFX10_1-NEXT:    ;;#ASMSTART
 ; GFX10_1-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
 ; GFX10_1-NEXT:    ;;#ASMEND
@@ -878,23 +879,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
 ; GFX10_1-NEXT:    ;;#ASMSTART
 ; GFX10_1-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s54, scc
 ; GFX10_1-NEXT:    ;;#ASMEND
-; GFX10_1-NEXT:    v_readlane_b32 s55, v21, 16
-; GFX10_1-NEXT:    v_readlane_b32 s54, v21, 15
-; GFX10_1-NEXT:    v_readlane_b32 s53, v21, 14
-; GFX10_1-NEXT:    v_readlane_b32 s52, v21, 13
-; GFX10_1-NEXT:    v_readlane_b32 s51, v21, 12
-; GFX10_1-NEXT:    v_readlane_b32 s50, v21, 11
-; GFX10_1-NEXT:    v_readlane_b32 s49, v21, 10
-; GFX10_1-NEXT:    v_readlane_b32 s48, v21, 9
-; GFX10_1-NEXT:    v_readlane_b32 s39, v21, 8
-; GFX10_1-NEXT:    v_readlane_b32 s38, v21, 7
-; GFX10_1-NEXT:    v_readlane_b32 s37, v21, 6
-; GFX10_1-NEXT:    v_readlane_b32 s36, v21, 5
-; GFX10_1-NEXT:    v_readlane_b32 s35, v21, 4
-; GFX10_1-NEXT:    v_readlane_b32 s34, v21, 3
-; GFX10_1-NEXT:    v_readlane_b32 s33, v21, 2
-; GFX10_1-NEXT:    v_readlane_b32 s31, v21, 1
-; GFX10_1-NEXT:    v_readlane_b32 s30, v21, 0
+; GFX10_1-NEXT:    v_readlane_b32 s30, v21, 15
+; GFX10_1-NEXT:    v_readlane_b32 s31, v21, 16
+; GFX10_1-NEXT:    v_readlane_b32 s55, v21, 14
+; GFX10_1-NEXT:    v_readlane_b32 s54, v21, 13
+; GFX10_1-NEXT:    v_readlane_b32 s53, v21, 12
+; GFX10_1-NEXT:    v_readlane_b32 s52, v21, 11
+; GFX10_1-NEXT:    v_readlane_b32 s51, v21, 10
+; GFX10_1-NEXT:    v_readlane_b32 s50, v21, 9
+; GFX10_1-NEXT:    v_readlane_b32 s49, v21, 8
+; GFX10_1-NEXT:    v_readlane_b32 s48, v21, 7
+; GFX10_1-NEXT:    v_readlane_b32 s39, v21, 6
+; GFX10_1-NEXT:    v_readlane_b32 s38, v21, 5
+; GFX10_1-NEXT:    v_readlane_b32 s37, v21, 4
+; GFX10_1-NEXT:    v_readlane_b32 s36, v21, 3
+; GFX10_1-NEXT:    v_readlane_b32 s35, v21, 2
+; GFX10_1-NEXT:    v_readlane_b32 s34, v21, 1
+; GFX10_1-NEXT:    v_readlane_b32 s33, v21, 0
 ; GFX10_1-NEXT:    s_xor_saveexec_b32 s4, -1
 ; GFX10_1-NEXT:    s_add_i32 s5, s32, 0x80200
 ; GFX10_1-NEXT:    buffer_load_dword v21, off, s[0:3], s5 ; 4-byte Folded Reload
@@ -910,24 +911,24 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
 ; GFX10_3-NEXT:    s_add_i32 s5, s32, 0x80200
 ; GFX10_3-NEXT:    buffer_store_dword v21, off, s[0:3], s5 ; 4-byte Folded Spill
 ; GFX10_3-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10_3-NEXT:    v_writelane_b32 v21, s30, 0
+; GFX10_3-NEXT:    v_writelane_b32 v21, s33, 0
 ; GFX10_3-NEXT:    s_and_b32 s59, 0, exec_lo
-; GFX10_3-NEXT:    v_writelane_b32 v21, s31, 1
-; GFX10_3-NEXT:    v_writelane_b32 v21, s33, 2
-; GFX10_3-NEXT:    v_writelane_b32 v21, s34, 3
-; GFX10_3-NEXT:    v_writelane_b32 v21, s35, 4
-; GFX10_3-NEXT:    v_writelane_b32 v21, s36, 5
-; GFX10_3-NEXT:    v_writelane_b32 v21, s37, 6
-; GFX10_3-NEXT:    v_writelane_b32 v21, s38, 7
-; GFX10_3-NEXT:    v_writelane_b32 v21, s39, 8
-; GFX10_3-NEXT:    v_writelane_b32 v21, s48, 9
-; GFX10_3-NEXT:    v_writelane_b32 v21, s49, 10
-; GFX10_3-NEXT:    v_writelane_b32 v21, s50, 11
-; GFX10_3-NEXT:    v_writelane_b32 v21, s51, 12
-; GFX10_3-NEXT:    v_writelane_b32 v21, s52, 13
-; GFX10_3-NEXT:    v_writelane_b32 v21, s53, 14
-; GFX10_3-NEXT:    v_writelane_b32 v21, s54, 15
-; GFX10_3-NEXT:    v_writelane_b32 v21, s55, 16
+; GFX10_3-NEXT:    v_writelane_b32 v21, s34, 1
+; GFX10_3-NEXT:    v_writelane_b32 v21, s35, 2
+; GFX10_3-NEXT:    v_writelane_b32 v21, s36, 3
+; GFX10_3-NEXT:    v_writelane_b32 v21, s37, 4
+; GFX10_3-NEXT:    v_writelane_b32 v21, s38, 5
+; GFX10_3-NEXT:    v_writelane_b32 v21, s39, 6
+; GFX10_3-NEXT:    v_writelane_b32 v21, s48, 7
+; GFX10_3-NEXT:    v_writelane_b32 v21, s49, 8
+; GFX10_3-NEXT:    v_writelane_b32 v21, s50, 9
+; GFX10_3-NEXT:    v_writelane_b32 v21, s51, 10
+; GFX10_3-NEXT:    v_writelane_b32 v21, s52, 11
+; GFX10_3-NEXT:    v_writelane_b32 v21, s53, 12
+; GFX10_3-NEXT:    v_writelane_b32 v21, s54, 13
+; GFX10_3-NEXT:    v_writelane_b32 v21, s55, 14
+; GFX10_3-NEXT:    v_writelane_b32 v21, s30, 15
+; GFX10_3-NEXT:    v_writelane_b32 v21, s31, 16
 ; GFX10_3-NEXT:    ;;#ASMSTART
 ; GFX10_3-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
 ; GFX10_3-NEXT:    ;;#ASMEND
@@ -938,23 +939,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
 ; GFX10_3-NEXT:    ;;#ASMSTART
 ; GFX10_3-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s54, scc
 ; GFX10_3-NEXT:    ;;#ASMEND
-; GFX10_3-NEXT:    v_readlane_b32 s55, v21, 16
-; GFX10_3-NEXT:    v_readlane_b32 s54, v21, 15
-; GFX10_3-NEXT:    v_readlane_b32 s53, v21, 14
-; GFX10_3-NEXT:    v_readlane_b32 s52, v21, 13
-; GFX10_3-NEXT:    v_readlane_b32 s51, v21, 12
-; GFX10_3-NEXT:    v_readlane_b32 s50, v21, 11
-; GFX10_3-NEXT:    v_readlane_b32 s49, v21, 10
-; GFX10_3-NEXT:    v_readlane_b32 s48, v21, 9
-; GFX10_3-NEXT:    v_readlane_b32 s39, v21, 8
-; GFX10_3-NEXT:    v_readlane_b32 s38, v21, 7
-; GFX10_3-NEXT:    v_readlane_b32 s37, v21, 6
-; GFX10_3-NEXT:    v_readlane_b32 s36, v21, 5
-; GFX10_3-NEXT:    v_readlane_b32 s35, v21, 4
-; GFX10_3-NEXT:    v_readlane_b32 s34, v21, 3
-; GFX10_3-NEXT:    v_readlane_b32 s33, v21, 2
-; GFX10_3-NEXT:    v_readlane_b32 s31, v21, 1
-; GFX10_3-NEXT:    v_readlane_b32 s30, v21, 0
+; GFX10_3-NEXT:    v_readlane_b32 s30, v21, 15
+; GFX10_3-NEXT:    v_readlane_b32 s31, v21, 16
+; GFX10_3-NEXT:    v_readlane_b32 s55, v21, 14
+; GFX10_3-NEXT:    v_readlane_b32 s54, v21, 13
+; GFX10_3-NEXT:    v_readlane_b32 s53, v21, 12
+; GFX10_3-NEXT:    v_readlane_b32 s52, v21, 11
+; GFX10_3-NEXT:    v_readlane_b32 s51, v21, 10
+; GFX10_3-NEXT:    v_readlane_b32 s50, v21, 9
+; GFX10_3-NEXT:    v_readlane_b32 s49, v21, 8
+; GFX10_3-NEXT:    v_readlane_b32 s48, v21, 7
+; GFX10_3-NEXT:    v_readlane_b32 s39, v21, 6
+; GFX10_3-NEXT:    v_readlane_b32 s38, v21, 5
+; GFX10_3-NEXT:    v_readlane_b32 s37, v21, 4
+; GFX10_3-NEXT:    v_readlane_b32 s36, v21, 3
+; GFX10_3-NEXT:    v_readlane_b32 s35, v21, 2
+; GFX10_3-NEXT:    v_readlane_b32 s34, v21, 1
+; GFX10_3-NEXT:    v_readlane_b32 s33, v21, 0
 ; GFX10_3-NEXT:    s_xor_saveexec_b32 s4, -1
 ; GFX10_3-NEXT:    s_add_i32 s5, s32, 0x80200
 ; GFX10_3-NEXT:    buffer_load_dword v21, off, s[0:3], s5 ; 4-byte Folded Reload
@@ -969,24 +970,24 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
 ; GFX11-NEXT:    s_add_i32 s1, s32, 0x4010
 ; GFX11-NEXT:    scratch_store_b32 off, v21, s1 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
-; GFX11-NEXT:    v_writelane_b32 v21, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v21, s33, 0
 ; GFX11-NEXT:    s_and_b32 s59, 0, exec_lo
-; GFX11-NEXT:    v_writelane_b32 v21, s31, 1
-; GFX11-NEXT:    v_writelane_b32 v21, s33, 2
-; GFX11-NEXT:    v_writelane_b32 v21, s34, 3
-; GFX11-NEXT:    v_writelane_b32 v21, s35, 4
-; GFX11-NEXT:    v_writelane_b32 v21, s36, 5
-; GFX11-NEXT:    v_writelane_b32 v21, s37, 6
-; GFX11-NEXT:    v_writelane_b32 v21, s38, 7
-; GFX11-NEXT:    v_writelane_b32 v21, s39, 8
-; GFX11-NEXT:    v_writelane_b32 v21, s48, 9
-; GFX11-NEXT:    v_writelane_b32 v21, s49, 10
-; GFX11-NEXT:    v_writelane_b32 v21, s50, 11
-; GFX11-NEXT:    v_writelane_b32 v21, s51, 12
-; GFX11-NEXT:    v_writelane_b32 v21, s52, 13
-; GFX11-NEXT:    v_writelane_b32 v21, s53, 14
-; GFX11-NEXT:    v_writelane_b32 v21, s54, 15
-; GFX11-NEXT:    v_writelane_b32 v21, s55, 16
+; GFX11-NEXT:    v_writelane_b32 v21, s34, 1
+; GFX11-NEXT:    v_writelane_b32 v21, s35, 2
+; GFX11-NEXT:    v_writelane_b32 v21, s36, 3
+; GFX11-NEXT:    v_writelane_b32 v21, s37, 4
+; GFX11-NEXT:    v_writelane_b32 v21, s38, 5
+; GFX11-NEXT:    v_writelane_b32 v21, s39, 6
+; GFX11-NEXT:    v_writelane_b32 v21, s48, 7
+; GFX11-NEXT:    v_writelane_b32 v21, s49, 8
+; GFX11-NEXT:    v_writelane_b32 v21, s50, 9
+; GFX11-NEXT:    v_writelane_b32 v21, s51, 10
+; GFX11-NEXT:    v_writelane_b32 v21, s52, 11
+; GFX11-NEXT:    v_writelane_b32 v21, s53, 12
+; GFX11-NEXT:    v_writelane_b32 v21, s54, 13
+; GFX11-NEXT:    v_writelane_b32 v21, s55, 14
+; GFX11-NEXT:    v_writelane_b32 v21, s30, 15
+; GFX11-NEXT:    v_writelane_b32 v21, s31, 16
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
 ; GFX11-NEXT:    ;;#ASMEND
@@ -999,23 +1000,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s54, scc
 ; GFX11-NEXT:    ;;#ASMEND
-; GFX11-NEXT:    v_readlane_b32 s55, v21, 16
-; GFX11-NEXT:    v_readlane_b32 s54, v21, 15
-; GFX11-NEXT:    v_readlane_b32 s53, v21, 14
-; GFX11-NEXT:    v_readlane_b32 s52, v21, 13
-; GFX11-NEXT:    v_readlane_b32 s51, v21, 12
-; GFX11-NEXT:    v_readlane_b32 s50, v21, 11
-; GFX11-NEXT:    v_readlane_b32 s49, v21, 10
-; GFX11-NEXT:    v_readlane_b32 s48, v21, 9
-; GFX11-NEXT:    v_readlane_b32 s39, v21, 8
-; GFX11-NEXT:    v_readlane_b32 s38, v21, 7
-; GFX11-NEXT:    v_readlane_b32 s37, v21, 6
-; GFX11-NEXT:    v_readlane_b32 s36, v21, 5
-; GFX11-NEXT:    v_readlane_b32 s35, v21, 4
-; GFX11-NEXT:    v_readlane_b32 s34, v21, 3
-; GFX11-NEXT:    v_readlane_b32 s33, v21, 2
-; GFX11-NEXT:    v_readlane_b32 s31, v21, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v21, 0
+; GFX11-NEXT:    v_readlane_b32 s30, v21, 15
+; GFX11-NEXT:    v_readlane_b32 s31, v21, 16
+; GFX11-NEXT:    v_readlane_b32 s55, v21, 14
+; GFX11-NEXT:    v_readlane_b32 s54, v21, 13
+; GFX11-NEXT:    v_readlane_b32 s53, v21, 12
+; GFX11-NEXT:    v_readlane_b32 s52, v21, 11
+; GFX11-NEXT:    v_readlane_b32 s51, v21, 10
+; GFX11-NEXT:    v_readlane_b32 s50, v21, 9
+; GFX11-NEXT:    v_readlane_b32 s49, v21, 8
+; GFX11-NEXT:    v_readlane_b32 s48, v21, 7
+; GFX11-NEXT:    v_readlane_b32 s39, v21, 6
+; GFX11-NEXT:    v_readlane_b32 s38, v21, 5
+; GFX11-NEXT:    v_readlane_b32 s37, v21, 4
+; GFX11-NEXT:    v_readlane_b32 s36, v21, 3
+; GFX11-NEXT:    v_readlane_b32 s35, v21, 2
+; GFX11-NEXT:    v_readlane_b32 s34, v21, 1
+; GFX11-NEXT:    v_readlane_b32 s33, v21, 0
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX11-NEXT:    s_add_i32 s1, s32, 0x4010
 ; GFX11-NEXT:    scratch_load_b32 v21, off, s1 ; 4-byte Folded Reload
@@ -1034,24 +1035,24 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
 ; GFX12-NEXT:    scratch_store_b32 off, v21, s32 offset:16384 ; 4-byte Folded Spill
 ; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12-NEXT:    s_mov_b32 exec_lo, s0
-; GFX12-NEXT:    v_writelane_b32 v21, s30, 0
+; GFX12-NEXT:    v_writelane_b32 v21, s33, 0
 ; GFX12-NEXT:    s_and_b32 s59, 0, exec_lo
-; GFX12-NEXT:    v_writelane_b32 v21, s31, 1
-; GFX12-NEXT:    v_writelane_b32 v21, s33, 2
-; GFX12-NEXT:    v_writelane_b32 v21, s34, 3
-; GFX12-NEXT:    v_writelane_b32 v21, s35, 4
-; GFX12-NEXT:    v_writelane_b32 v21, s36, 5
-; GFX12-NEXT:    v_writelane_b32 v21, s37, 6
-; GFX12-NEXT:    v_writelane_b32 v21, s38, 7
-; GFX12-NEXT:    v_writelane_b32 v21, s39, 8
-; GFX12-NEXT:    v_writelane_b32 v21, s48, 9
-; GFX12-NEXT:    v_writelane_b32 v21, s49, 10
-; GFX12-NEXT:    v_writelane_b32 v21, s50, 11
-; GFX12-NEXT:    v_writelane_b32 v21, s51, 12
-; GFX12-NEXT:    v_writelane_b32 v21, s52, 13
-; GFX12-NEXT:    v_writelane_b32 v21, s53, 14
-; GFX12-NEXT:    v_writelane_b32 v21, s54, 15
-; GFX12-NEXT:    v_writelane_b32 v21, s55, 16
+; GFX12-NEXT:    v_writelane_b32 v21, s34, 1
+; GFX12-NEXT:    v_writelane_b32 v21, s35, 2
+; GFX12-NEXT:    v_writelane_b32 v21, s36, 3
+; GFX12-NEXT:    v_writelane_b32 v21, s37, 4
+; GFX12-NEXT:    v_writelane_b32 v21, s38, 5
+; GFX12-NEXT:    v_writelane_b32 v21, s39, 6
+; GFX12-NEXT:    v_writelane_b32 v21, s48, 7
+; GFX12-NEXT:    v_writelane_b32 v21, s49, 8
+; GFX12-NEXT:    v_writelane_b32 v21, s50, 9
+; GFX12-NEXT:    v_writelane_b32 v21, s51, 10
+; GFX12-NEXT:    v_writelane_b32 v21, s52, 11
+; GFX12-NEXT:    v_writelane_b32 v21, s53, 12
+; GFX12-NEXT:    v_writelane_b32 v21, s54, 13
+; GFX12-NEXT:    v_writelane_b32 v21, s55, 14
+; GFX12-NEXT:    v_writelane_b32 v21, s30, 15
+; GFX12-NEXT:    v_writelane_b32 v21, s31, 16
 ; GFX12-NEXT:    ;;#ASMSTART
 ; GFX12-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
 ; GFX12-NEXT:    ;;#ASMEND
@@ -1061,23 +1062,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
 ; GFX12-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s54, scc
 ; GFX12-NEXT:    ;;#ASMEND
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_readlane_b32 s55, v21, 16
-; GFX12-NEXT:    v_readlane_b32 s54, v21, 15
-; GFX12-NEXT:    v_readlane_b32 s53, v21, 14
-; GFX12-NEXT:    v_readlane_b32 s52, v21, 13
-; GFX12-NEXT:    v_readlane_b32 s51, v21, 12
-; GFX12-NEXT:    v_readlane_b32 s50, v21, 11
-; GFX12-NEXT:    v_readlane_b32 s49, v21, 10
-; GFX12-NEXT:    v_readlane_b32 s48, v21, 9
-; GFX12-NEXT:    v_readlane_b32 s39, v21, 8
-; GFX12-NEXT:    v_readlane_b32 s38, v21, 7
-; GFX12-NEXT:    v_readlane_b32 s37, v21, 6
-; GFX12-NEXT:    v_readlane_b32 s36, v21, 5
-; GFX12-NEXT:    v_readlane_b32 s35, v21, 4
-; GFX12-NEXT:    v_readlane_b32 s34, v21, 3
-; GFX12-NEXT:    v_readlane_b32 s33, v21, 2
-; GFX12-NEXT:    v_readlane_b32 s31, v21, 1
-; GFX12-NEXT:    v_readlane_b32 s30, v21, 0
+; GFX12-NEXT:    v_readlane_b32 s30, v21, 15
+; GFX12-NEXT:    v_readlane_b32 s31, v21, 16
+; GFX12-NEXT:    v_readlane_b32 s55, v21, 14
+; GFX12-NEXT:    v_readlane_b32 s54, v21, 13
+; GFX12-NEXT:    v_readlane_b32 s53, v21, 12
+; GFX12-NEXT:    v_readlane_b32 s52, v21, 11
+; GFX12-NEXT:    v_readlane_b32 s51, v21, 10
+; GFX12-NEXT:    v_readlane_b32 s50, v21, 9
+; GFX12-NEXT:    v_readlane_b32 s49, v21, 8
+; GFX12-NEXT:    v_readlane_b32 s48, v21, 7
+; GFX12-NEXT:    v_readlane_b32 s39, v21, 6
+; GFX12-NEXT:    v_readlane_b32 s38, v21, 5
+; GFX12-NEXT:    v_readlane_b32 s37, v21, 4
+; GFX12-NEXT:    v_readlane_b32 s36, v21, 3
+; GFX12-NEXT:    v_readlane_b32 s35, v21, 2
+; GFX12-NEXT:    v_readlane_b32 s34, v21, 1
+; GFX12-NEXT:    v_readlane_b32 s33, v21, 0
 ; GFX12-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX12-NEXT:    scratch_load_b32 v21, off, s32 offset:16384 ; 4-byte Folded Reload
 ; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
@@ -1135,30 +1136,30 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
 ; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX7-NEXT:    v_writelane_b32 v23, s28, 17
 ; GFX7-NEXT:    v_writelane_b32 v23, s29, 18
-; GFX7-NEXT:    v_writelane_b32 v23, s30, 0
-; GFX7-NEXT:    v_writelane_b32 v23, s31, 1
-; GFX7-NEXT:    v_writelane_b32 v23, s33, 2
-; GFX7-NEXT:    v_writelane_b32 v23, s34, 3
-; GFX7-NEXT:    v_writelane_b32 v23, s35, 4
-; GFX7-NEXT:    v_writelane_b32 v23, s36, 5
-; GFX7-NEXT:    v_writelane_b32 v23, s37, 6
-; GFX7-NEXT:    v_writelane_b32 v23, s38, 7
-; GFX7-NEXT:    v_writelane_b32 v23, s39, 8
-; GFX7-NEXT:    v_writelane_b32 v23, s48, 9
-; GFX7-NEXT:    v_writelane_b32 v23, s49, 10
-; GFX7-NEXT:    v_writelane_b32 v23, s50, 11
-; GFX7-NEXT:    v_writelane_b32 v23, s51, 12
-; GFX7-NEXT:    v_writelane_b32 v23, s52, 13
+; GFX7-NEXT:    v_writelane_b32 v23, s33, 0
+; GFX7-NEXT:    v_writelane_b32 v23, s34, 1
+; GFX7-NEXT:    v_writelane_b32 v23, s35, 2
+; GFX7-NEXT:    v_writelane_b32 v23, s36, 3
+; GFX7-NEXT:    v_writelane_b32 v23, s37, 4
+; GFX7-NEXT:    v_writelane_b32 v23, s38, 5
+; GFX7-NEXT:    v_writelane_b32 v23, s39, 6
+; GFX7-NEXT:    v_writelane_b32 v23, s48, 7
+; GFX7-NEXT:    v_writelane_b32 v23, s49, 8
+; GFX7-NEXT:    v_writelane_b32 v23, s50, 9
+; GFX7-NEXT:    v_writelane_b32 v23, s51, 10
+; GFX7-NEXT:    v_writelane_b32 v23, s52, 11
+; GFX7-NEXT:    v_writelane_b32 v23, s53, 12
+; GFX7-NEXT:    v_writelane_b32 v23, s54, 13
 ; GFX7-NEXT:    s_lshr_b32 s5, s32, 6
-; GFX7-NEXT:    v_writelane_b32 v23, s53, 14
+; GFX7-NEXT:    v_writelane_b32 v23, s55, 14
 ; GFX7-NEXT:    v_lshr_b32_e64 v0, s32, 6
 ; GFX7-NEXT:    s_add_i32 s4, s5, 0x4240
 ; GFX7-NEXT:    ; implicit-def: $vgpr22 : SGPR spill to VGPR lane
-; GFX7-NEXT:    v_writelane_b32 v23, s54, 15
+; GFX7-NEXT:    v_writelane_b32 v23, s30, 15
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 64, v0
 ; GFX7-NEXT:    v_writelane_b32 v22, s4, 0
 ; GFX7-NEXT:    s_and_b64 s[4:5], 0, exec
-; GFX7-NEXT:    v_writelane_b32 v23, s55, 16
+; GFX7-NEXT:    v_writelane_b32 v23, s31, 16
 ; GFX7-NEXT:    ;;#ASMSTART
 ; GFX7-NEXT:    ; use alloca0 v0
 ; GFX7-NEXT:    ;;#ASMEND
@@ -1169,23 +1170,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
 ; GFX7-NEXT:    ;;#ASMSTART
 ; GFX7-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s54, scc
 ; GFX7-NEXT:    ;;#ASMEND
-; GFX7-NEXT:    v_readlane_b32 s55, v23, 16
-; GFX7-NEXT:    v_readlane_b32 s54, v23, 15
-; GFX7-NEXT:    v_readlane_b32 s53, v23, 14
-; GFX7-NEXT:    v_readlane_b32 s52, v23, 13
-; GFX7-NEXT:    v_readlane_b32 s51, v23, 12
-; GFX7-NEXT:    v_readlane_b32 s50, v23, 11
-; GFX7-NEXT:    v_readlane_b32 s49, v23, 10
-; GFX7-NEXT:    v_readlane_b32 s48, v23, 9
-; GFX7-NEXT:    v_readlane_b32 s39, v23, 8
-; GFX7-NEXT:    v_readlane_b32 s38, v23, 7
-; GFX7-NEXT:    v_readlane_b32 s37, v23, 6
-; GFX7-NEXT:    v_readlane_b32 s36, v23, 5
-; GFX7-NEXT:    v_readlane_b32 s35, v23, 4
-; GFX7-NEXT:    v_readlane_b32 s34, v23, 3
-; GFX7-NEXT:    v_readlane_b32 s33, v23, 2
-; GFX7-NEXT:    v_readlane_b32 s31, v23, 1
-; GFX7-NEXT:    v_readlane_b32 s30, v23, 0
+; GFX7-NEXT:    v_readlane_b32 s30, v23, 15
+; GFX7-NEXT:    v_readlane_b32 s31, v23, 16
+; GFX7-NEXT:    v_readlane_b32 s55, v23, 14
+; GFX7-NEXT:    v_readlane_b32 s54, v23, 13
+; GFX7-NEXT:    v_readlane_b32 s53, v23, 12
+; GFX7-NEXT:    v_readlane_b32 s52, v23, 11
+; GFX7-NEXT:    v_readlane_b32 s51, v23, 10
+; GFX7-NEXT:    v_readlane_b32 s50, v23, 9
+; GFX7-NEXT:    v_readlane_b32 s49, v23, 8
+; GFX7-NEXT:    v_readlane_b32 s48, v23, 7
+; GFX7-NEXT:    v_readlane_b32 s39, v23, 6
+; GFX7-NEXT:    v_readlane_b32 s38, v23, 5
+; GFX7-NEXT:    v_readlane_b32 s37, v23, 4
+; GFX7-NEXT:    v_readlane_b32 s36, v23, 3
+; GFX7-NEXT:    v_readlane_b32 s35, v23, 2
+; GFX7-NEXT:    v_readlane_b32 s34, v23, 1
+; GFX7-NEXT:    v_readlane_b32 s33, v23, 0
 ; GFX7-NEXT:    v_readlane_b32 s28, v23, 17
 ; GFX7-NEXT:    v_readlane_b32 s29, v23, 18
 ; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
@@ -1206,30 +1207,30 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
 ; GFX8-NEXT:    s_add_i32 s6, s32, 0x201100
 ; GFX8-NEXT:    buffer_store_dword v22, off, s[0:3], s6 ; 4-byte Folded Spill
 ; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX8-NEXT:    v_writelane_b32 v23, s30, 0
-; GFX8-NEXT:    v_writelane_b32 v23, s31, 1
-; GFX8-NEXT:    v_writelane_b32 v23, s33, 2
-; GFX8-NEXT:    v_writelane_b32 v23, s34, 3
-; GFX8-NEXT:    v_writelane_b32 v23, s35, 4
-; GFX8-NEXT:    v_writelane_b32 v23, s36, 5
-; GFX8-NEXT:    v_writelane_b32 v23, s37, 6
-; GFX8-NEXT:    v_writelane_b32 v23, s38, 7
-; GFX8-NEXT:    v_writelane_b32 v23, s39, 8
-; GFX8-NEXT:    v_writelane_b32 v23, s48, 9
-; GFX8-NEXT:    v_writelane_b32 v23, s49, 10
-; GFX8-NEXT:    v_writelane_b32 v23, s50, 11
-; GFX8-NEXT:    v_writelane_b32 v23, s51, 12
-; GFX8-NEXT:    v_writelane_b32 v23, s52, 13
+; GFX8-NEXT:    v_writelane_b32 v23, s33, 0
+; GFX8-NEXT:    v_writelane_b32 v23, s34, 1
+; GFX8-NEXT:    v_writelane_b32 v23, s35, 2
+; GFX8-NEXT:    v_writelane_b32 v23, s36, 3
+; GFX8-NEXT:    v_writelane_b32 v23, s37, 4
+; GFX8-NEXT:    v_writelane_b32 v23, s38, 5
+; GFX8-NEXT:    v_writelane_b32 v23, s39, 6
+; GFX8-NEXT:    v_writelane_b32 v23, s48, 7
+; GFX8-NEXT:    v_writelane_b32 v23, s49, 8
+; GFX8-NEXT:    v_writelane_b32 v23, s50, 9
+; GFX8-NEXT:    v_writelane_b32 v23, s51, 10
+; GFX8-NEXT:    v_writelane_b32 v23, s52, 11
+; GFX8-NEXT:    v_writelane_b32 v23, s53, 12
+; GFX8-NEXT:    v_writelane_b32 v23, s54, 13
 ; GFX8-NEXT:    s_lshr_b32 s5, s32, 6
-; GFX8-NEXT:    v_writelane_b32 v23, s53, 14
+; GFX8-NEXT:    v_writelane_b32 v23, s55, 14
 ; GFX8-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
 ; GFX8-NEXT:    s_add_i32 s4, s5, 0x4240
 ; GFX8-NEXT:    ; implicit-def: $vgpr22 : SGPR spill to VGPR lane
-; GFX8-NEXT:    v_writelane_b32 v23, s54, 15
+; GFX8-NEXT:    v_writelane_b32 v23, s30, 15
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 64, v0
 ; GFX8-NEXT:    v_writelane_b32 v22, s4, 0
 ; GFX8-NEXT:    s_and_b64 s[4:5], 0, exec
-; GFX8-NEXT:    v_writelane_b32 v23, s55, 16
+; GFX8-NEXT:    v_writelane_b32 v23, s31, 16
 ; GFX8-NEXT:    ;;#ASMSTART
 ; GFX8-NEXT:    ; use alloca0 v0
 ; GFX8-NEXT:    ;;#ASMEND
@@ -1241,23 +1242,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
 ; GFX8-NEXT:    ;;#ASMSTART
 ; GFX8-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s54, scc
 ; GFX8-NEXT:    ;;#ASMEND
-; GFX8-NEXT:    v_readlane_b32 s55, v23, 16
-; GFX8-NEXT:    v_readlane_b32 s54, v23, 15
-; GFX8-NEXT:    v_readlane_b32 s53, v23, 14
-; GFX8-NEXT:    v_readlane_b32 s52, v23, 13
-; GFX8-NEXT:    v_readlane_b32 s51, v23, 12
-; GFX8-NEXT:    v_readlane_b32 s50, v23, 11
-; GFX8-NEXT:    v_readlane_b32 s49, v23, 10
-; GFX8-NEXT:    v_readlane_b32 s48, v23, 9
-; GFX8-NEXT:    v_readlane_b32 s39, v23, 8
-; GFX8-NEXT:    v_readlane_b32 s38, v23, 7
-; GFX8-NEXT:    v_readlane_b32 s37, v23, 6
-; GFX8-NEXT:    v_readlane_b32 s36, v23, 5
-; GFX8-NEXT:    v_readlane_b32 s35, v23, 4
-; GFX8-NEXT:    v_readlane_b32 s34, v23, 3
-; GFX8-NEXT:    v_readlane_b32 s33, v23, 2
-; GFX8-NEXT:    v_readlane_b32 s31, v23, 1
-; GFX8-NEXT:    v_readlane_b32 s30, v23, 0
+; GFX8-NEXT:    v_readlane_b32 s30, v23, 15
+; GFX8-NEXT:    v_readlane_b32 s31, v23, 16
+; GFX8-NEXT:    v_readlane_b32 s55, v23, 14
+; GFX8-NEXT:    v_readlane_b32 s54, v23, 13
+; GFX8-NEXT:    v_readlane_b32 s53, v23, 12
+; GFX8-NEXT:    v_readlane_b32 s52, v23, 11
+; GFX8-NEXT:    v_readlane_b32 s51, v23, 10
+; GFX8-NEXT:    v_readlane_b32 s50, v23, 9
+; GFX8-NEXT:    v_readlane_b32 s49, v23, 8
+; GFX8-NEXT:    v_readlane_b32 s48, v23, 7
+; GFX8-NEXT:    v_readlane_b32 s39, v23, 6
+; GFX8-NEXT:    v_readlane_b32 s38, v23, 5
+; GFX8-NEXT:    v_readlane_b32 s37, v23, 4
+; GFX8-NEXT:    v_readlane_b32 s36, v23, 3
+; GFX8-NEXT:    v_readlane_b32 s35, v23, 2
+; GFX8-NEXT:    v_readlane_b32 s34, v23, 1
+; GFX8-NEXT:    v_readlane_b32 s33, v23, 0
 ; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX8-NEXT:    s_add_i32 s6, s32, 0x201000
 ; GFX8-NEXT:    buffer_load_dword v23, off, s[0:3], s6 ; 4-byte Folded Reload
@@ -1276,30 +1277,30 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
 ; GFX900-NEXT:    s_add_i32 s6, s32, 0x201100
 ; GFX900-NEXT:    buffer_store_dword v22, off, s[0:3], s6 ; 4-byte Folded Spill
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX900-NEXT:    v_writelane_b32 v23, s30, 0
-; GFX900-NEXT:    v_writelane_b32 v23, s31, 1
-; GFX900-NEXT:    v_writelane_b32 v23, s33, 2
-; GFX900-NEXT:    v_writelane_b32 v23, s34, 3
-; GFX900-NEXT:    v_writelane_b32 v23, s35, 4
-; GFX900-NEXT:    v_writelane_b32 v23, s36, 5
-; GFX900-NEXT:    v_writelane_b32 v23, s37, 6
-; GFX900-NEXT:    v_writelane_b32 v23, s38, 7
-; GFX900-NEXT:    v_writelane_b32 v23, s39, 8
-; GFX900-NEXT:    v_writelane_b32 v23, s48, 9
-; GFX900-NEXT:    v_writelane_b32 v23, s49, 10
-; GFX900-NEXT:    v_writelane_b32 v23, s50, 11
-; GFX900-NEXT:    v_writelane_b32 v23, s51, 12
-; GFX900-NEXT:    v_writelane_b32 v23, s52, 13
+; GFX900-NEXT:    v_writelane_b32 v23, s33, 0
+; GFX900-NEXT:    v_writelane_b32 v23, s34, 1
+; GFX900-NEXT:    v_writelane_b32 v23, s35, 2
+; GFX900-NEXT:    v_writelane_b32 v23, s36, 3
+; GFX900-NEXT:    v_writelane_b32 v23, s37, 4
+; GFX900-NEXT:    v_writelane_b32 v23, s38, 5
+; GFX900-NEXT:    v_writelane_b32 v23, s39, 6
+; GFX900-NEXT:    v_writelane_b32 v23, s48, 7
+; GFX900-NEXT:    v_writelane_b32 v23, s49, 8
+; GFX900-NEXT:    v_writelane_b32 v23, s50, 9
+; GFX900-NEXT:    v_writelane_b32 v23, s51, 10
+; GFX900-NEXT:    v_writelane_b32 v23, s52, 11
+; GFX900-NEXT:    v_writelane_b32 v23, s53, 12
+; GFX900-NEXT:    v_writelane_b32 v23, s54, 13
 ; GFX900-NEXT:    s_lshr_b32 s5, s32, 6
-; GFX900-NEXT:    v_writelane_b32 v23, s53, 14
+; GFX900-NEXT:    v_writelane_b32 v23, s55, 14
 ; GFX900-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
 ; GFX900-NEXT:    s_add_i32 s4, s5, 0x4240
 ; GFX900-NEXT:    ; implicit-def: $vgpr22 : SGPR spill to VGPR lane
-; GFX900-NEXT:    v_writelane_b32 v23, s54, 15
+; GFX900-NEXT:    v_writelane_b32 v23, s30, 15
 ; GFX900-NEXT:    v_add_u32_e32 v0, 64, v0
 ; GFX900-NEXT:    v_writelane_b32 v22, s4, 0
 ; GFX900-NEXT:    s_and_b64 s[4:5], 0, exec
-; GFX900-NEXT:    v_writelane_b32 v23, s55, 16
+; GFX900-NEXT:    v_writelane_b32 v23, s31, 16
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use alloca0 v0
 ; GFX900-NEXT:    ;;#ASMEND
@@ -1311,23 +1312,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s54, scc
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    v_readlane_b32 s55, v23, 16
-; GFX900-NEXT:    v_readlane_b32 s54, v23, 15
-; GFX900-NEXT:    v_readlane_b32 s53, v23, 14
-; GFX900-NEXT:    v_readlane_b32 s52, v23, 13
-; GFX900-NEXT:    v_readlane_b32 s51, v23, 12
-; GFX900-NEXT:    v_readlane_b32 s50, v23, 11
-; GFX900-NEXT:    v_readlane_b32 s49, v23, 10
-; GFX900-NEXT:    v_readlane_b32 s48, v23, 9
-; GFX900-NEXT:    v_readlane_b32 s39, v23, 8
-; GFX900-NEXT:    v_readlane_b32 s38, v23, 7
-; GFX900-NEXT:    v_readlane_b32 s37, v23, 6
-; GFX900-NEXT:    v_readlane_b32 s36, v23, 5
-; GFX900-NEXT:    v_readlane_b32 s35, v23, 4
-; GFX900-NEXT:    v_readlane_b32 s34, v23, 3
-; GFX900-NEXT:    v_readlane_b32 s33, v23, 2
-; GFX900-NEXT:    v_readlane_b32 s31, v23, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v23, 0
+; GFX900-NEXT:    v_readlane_b32 s30, v23, 15
+; GFX900-NEXT:    v_readlane_b32 s31, v23, 16
+; GFX900-NEXT:    v_readlane_b32 s55, v23, 14
+; GFX900-NEXT:    v_readlane_b32 s54, v23, 13
+; GFX900-NEXT:    v_readlane_b32 s53, v23, 12
+; GFX900-NEXT:    v_readlane_b32 s52, v23, 11
+; GFX900-NEXT:    v_readlane_b32 s51, v23, 10
+; GFX900-NEXT:    v_readlane_b32 s50, v23, 9
+; GFX900-NEXT:    v_readlane_b32 s49, v23, 8
+; GFX900-NEXT:    v_readlane_b32 s48, v23, 7
+; GFX900-NEXT:    v_readlane_b32 s39, v23, 6
+; GFX900-NEXT:    v_readlane_b32 s38, v23, 5
+; GFX900-NEXT:    v_readlane_b32 s37, v23, 4
+; GFX900-NEXT:    v_readlane_b32 s36, v23, 3
+; GFX900-NEXT:    v_readlane_b32 s35, v23, 2
+; GFX900-NEXT:    v_readlane_b32 s34, v23, 1
+; GFX900-NEXT:    v_readlane_b32 s33, v23, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    s_add_i32 s6, s32, 0x201000
 ; GFX900-NEXT:    buffer_load_dword v23, off, s[0:3], s6 ; 4-byte Folded Reload
@@ -1344,28 +1345,29 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
 ; GFX942-NEXT:    s_add_i32 s2, s32, 0x8040
 ; GFX942-NEXT:    scratch_store_dword off, v22, s2 ; 4-byte Folded Spill
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX942-NEXT:    v_writelane_b32 v22, s30, 0
-; GFX942-NEXT:    v_writelane_b32 v22, s31, 1
-; GFX942-NEXT:    v_writelane_b32 v22, s33, 2
-; GFX942-NEXT:    v_writelane_b32 v22, s34, 3
-; GFX942-NEXT:    v_writelane_b32 v22, s35, 4
-; GFX942-NEXT:    v_writelane_b32 v22, s36, 5
-; GFX942-NEXT:    v_writelane_b32 v22, s37, 6
-; GFX942-NEXT:    v_writelane_b32 v22, s38, 7
-; GFX942-NEXT:    v_writelane_b32 v22, s39, 8
-; GFX942-NEXT:    v_writelane_b32 v22, s48, 9
-; GFX942-NEXT:    v_writelane_b32 v22, s49, 10
-; GFX942-NEXT:    v_writelane_b32 v22, s50, 11
-; GFX942-NEXT:    v_writelane_b32 v22, s51, 12
-; GFX942-NEXT:    v_writelane_b32 v22, s52, 13
-; GFX942-NEXT:    v_writelane_b32 v22, s53, 14
+; GFX942-NEXT:    v_writelane_b32 v22, s33, 0
+; GFX942-NEXT:    v_writelane_b32 v22, s34, 1
+; GFX942-NEXT:    v_writelane_b32 v22, s35, 2
+; GFX942-NEXT:    v_writelane_b32 v22, s36, 3
+; GFX942-NEXT:    v_writelane_b32 v22, s37, 4
+; GFX942-NEXT:    v_writelane_b32 v22, s38, 5
+; GFX942-NEXT:    v_writelane_b32 v22, s39, 6
+; GFX942-NEXT:    v_writelane_b32 v22, s48, 7
+; GFX942-NEXT:    v_writelane_b32 v22, s49, 8
+; GFX942-NEXT:    v_writelane_b32 v22, s50, 9
+; GFX942-NEXT:    v_writelane_b32 v22, s51, 10
+; GFX942-NEXT:    v_writelane_b32 v22, s52, 11
+; GFX942-NEXT:    v_writelane_b32 v22, s53, 12
+; GFX942-NEXT:    v_writelane_b32 v22, s54, 13
+; GFX942-NEXT:    v_writelane_b32 v22, s55, 14
 ; GFX942-NEXT:    s_add_i32 s0, s32, 64
-; GFX942-NEXT:    v_writelane_b32 v22, s54, 15
+; GFX942-NEXT:    v_writelane_b32 v22, s30, 15
 ; GFX942-NEXT:    v_mov_b32_e32 v0, s0
-; GFX942-NEXT:    v_writelane_b32 v22, s55, 16
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use alloca0 v0
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_writelane_b32 v22, s31, 16
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc
 ; GFX942-NEXT:    ;;#ASMEND
@@ -1376,23 +1378,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s54, scc
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_readlane_b32 s55, v22, 16
-; GFX942-NEXT:    v_readlane_b32 s54, v22, 15
-; GFX942-NEXT:    v_readlane_b32 s53, v22, 14
-; GFX942-NEXT:    v_readlane_b32 s52, v22, 13
-; GFX942-NEXT:    v_readlane_b32 s51, v22, 12
-; GFX942-NEXT:    v_readlane_b32 s50, v22, 11
-; GFX942-NEXT:    v_readlane_b32 s49, v22, 10
-; GFX942-NEXT:    v_readlane_b32 s48, v22, 9
-; GFX942-NEXT:    v_readlane_b32 s39, v22, 8
-; GFX942-NEXT:    v_readlane_b32 s38, v22, 7
-; GFX942-NEXT:    v_readlane_b32 s37, v22, 6
-; GFX942-NEXT:    v_readlane_b32 s36, v22, 5
-; GFX942-NEXT:    v_readlane_b32 s35, v22, 4
-; GFX942-NEXT:    v_readlane_b32 s34, v22, 3
-; GFX942-NEXT:    v_readlane_b32 s33, v22, 2
-; GFX942-NEXT:    v_readlane_b32 s31, v22, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v22, 0
+; GFX942-NEXT:    v_readlane_b32 s30, v22, 15
+; GFX942-NEXT:    v_readlane_b32 s31, v22, 16
+; GFX942-NEXT:    v_readlane_b32 s55, v22, 14
+; GFX942-NEXT:    v_readlane_b32 s54, v22, 13
+; GFX942-NEXT:    v_readlane_b32 s53, v22, 12
+; GFX942-NEXT:    v_readlane_b32 s52, v22, 11
+; GFX942-NEXT:    v_readlane_b32 s51, v22, 10
+; GFX942-NEXT:    v_readlane_b32 s50, v22, 9
+; GFX942-NEXT:    v_readlane_b32 s49, v22, 8
+; GFX942-NEXT:    v_readlane_b32 s48, v22, 7
+; GFX942-NEXT:    v_readlane_b32 s39, v22, 6
+; GFX942-NEXT:    v_readlane_b32 s38, v22, 5
+; GFX942-NEXT:    v_readlane_b32 s37, v22, 4
+; GFX942-NEXT:    v_readlane_b32 s36, v22, 3
+; GFX942-NEXT:    v_readlane_b32 s35, v22, 2
+; GFX942-NEXT:    v_readlane_b32 s34, v22, 1
+; GFX942-NEXT:    v_readlane_b32 s33, v22, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    s_add_i32 s2, s32, 0x8040
 ; GFX942-NEXT:    scratch_load_dword v22, off, s2 ; 4-byte Folded Reload
@@ -1408,31 +1410,31 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
 ; GFX10_1-NEXT:    buffer_store_dword v22, off, s[0:3], s5 ; 4-byte Folded Spill
 ; GFX10_1-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
 ; GFX10_1-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10_1-NEXT:    v_writelane_b32 v22, s30, 0
+; GFX10_1-NEXT:    v_writelane_b32 v22, s33, 0
 ; GFX10_1-NEXT:    v_lshrrev_b32_e64 v0, 5, s32
 ; GFX10_1-NEXT:    s_lshr_b32 s4, s32, 5
 ; GFX10_1-NEXT:    s_add_i32 s58, s4, 0x4240
-; GFX10_1-NEXT:    v_writelane_b32 v22, s31, 1
+; GFX10_1-NEXT:    v_writelane_b32 v22, s34, 1
 ; GFX10_1-NEXT:    v_add_nc_u32_e32 v0, 64, v0
 ; GFX10_1-NEXT:    s_and_b32 s4, 0, exec_lo
 ; GFX10_1-NEXT:    ;;#ASMSTART
 ; GFX10_1-NEXT:    ; use alloca0 v0
 ; GFX10_1-NEXT:    ;;#ASMEND
-; GFX10_1-NEXT:    v_writelane_b32 v22, s33, 2
-; GFX10_1-NEXT:    v_writelane_b32 v22, s34, 3
-; GFX10_1-NEXT:    v_writelane_b32 v22, s35, 4
-; GFX10_1-NEXT:    v_writelane_b32 v22, s36, 5
-; GFX10_1-NEXT:    v_writelane_b32 v22, s37, 6
-; GFX10_1-NEXT:    v_writelane_b32 v22, s38, 7
-; GFX10_1-NEXT:    v_writelane_b32 v22, s39, 8
-; GFX10_1-NEXT:    v_writelane_b32 v22, s48, 9
-; GFX10_1-NEXT:    v_writelane_b32 v22, s49, 10
-; GFX10_1-NEXT:    v_writelane_b32 v22, s50, 11
-; GFX10_1-NEXT:    v_writelane_b32 v22, s51, 12
-; GFX10_1-NEXT:    v_writelane_b32 v22, s52, 13
-; GFX10_1-NEXT:    v_writelane_b32 v22, s53, 14
-; GFX10_1-NEXT:    v_writelane_b32 v22, s54, 15
-; GFX10_1-NEXT:    v_writelane_b32 v22, s55, 16
+; GFX10_1-NEXT:    v_writelane_b32 v22, s35, 2
+; GFX10_1-NEXT:    v_writelane_b32 v22, s36, 3
+; GFX10_1-NEXT:    v_writelane_b32 v22, s37, 4
+; GFX10_1-NEXT:    v_writelane_b32 v22, s38, 5
+; GFX10_1-NEXT:    v_writelane_b32 v22, s39, 6
+; GFX10_1-NEXT:    v_writelane_b32 v22, s48, 7
+; GFX10_1-NEXT:    v_writelane_b32 v22, s49, 8
+; GFX10_1-NEXT:    v_writelane_b32 v22, s50, 9
+; GFX10_1-NEXT:    v_writelane_b32 v22, s51, 10
+; GFX10_1-NEXT:    v_writelane_b32 v22, s52, 11
+; GFX10_1-NEXT:    v_writelane_b32 v22, s53, 12
+; GFX10_1-NEXT:    v_writelane_b32 v22, s54, 13
+; GFX10_1-NEXT:    v_writelane_b32 v22, s55, 14
+; GFX10_1-NEXT:    v_writelane_b32 v22, s30, 15
+; GFX10_1-NEXT:    v_writelane_b32 v22, s31, 16
 ; GFX10_1-NEXT:    ;;#ASMSTART
 ; GFX10_1-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc
 ; GFX10_1-NEXT:    ;;#ASMEND
@@ -1441,23 +1443,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
 ; GFX10_1-NEXT:    ;;#ASMSTART
 ; GFX10_1-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s54, scc
 ; GFX10_1-NEXT:    ;;#ASMEND
-; GFX10_1-NEXT:    v_readlane_b32 s55, v22, 16
-; GFX10_1-NEXT:    v_readlane_b32 s54, v22, 15
-; GFX10_1-NEXT:    v_readlane_b32 s53, v22, 14
-; GFX10_1-NEXT:    v_readlane_b32 s52, v22, 13
-; GFX10_1-NEXT:    v_readlane_b32 s51, v22, 12
-; GFX10_1-NEXT:    v_readlane_b32 s50, v22, 11
-; GFX10_1-NEXT:    v_readlane_b32 s49, v22, 10
-; GFX10_1-NEXT:    v_readlane_b32 s48, v22, 9
-; GFX10_1-NEXT:    v_readlane_b32 s39, v22, 8
-; GFX10_1-NEXT:    v_readlane_b32 s38, v22, 7
-; GFX10_1-NEXT:    v_readlane_b32 s37, v22, 6
-; GFX10_1-NEXT:    v_readlane_b32 s36, v22, 5
-; GFX10_1-NEXT:    v_readlane_b32 s35, v22, 4
-; GFX10_1-NEXT:    v_readlane_b32 s34, v22, 3
-; GFX10_1-NEXT:    v_readlane_b32 s33, v22, 2
-; GFX10_1-NEXT:    v_readlane_b32 s31, v22, 1
-; GFX10_1-NEXT:    v_readlane_b32 s30, v22, 0
+; GFX10_1-NEXT:    v_readlane_b32 s30, v22, 15
+; GFX10_1-NEXT:    v_readlane_b32 s31, v22, 16
+; GFX10_1-NEXT:    v_readlane_b32 s55, v22, 14
+; GFX10_1-NEXT:    v_readlane_b32 s54, v22, 13
+; GFX10_1-NEXT:    v_readlane_b32 s53, v22, 12
+; GFX10_1-NEXT:    v_readlane_b32 s52, v22, 11
+; GFX10_1-NEXT:    v_readlane_b32 s51, v22, 10
+; GFX10_1-NEXT:    v_readlane_b32 s50, v22, 9
+; GFX10_1-NEXT:    v_readlane_b32 s49, v22, 8
+; GFX10_1-NEXT:    v_readlane_b32 s48, v22, 7
+; GFX10_1-NEXT:    v_readlane_b32 s39, v22, 6
+; GFX10_1-NEXT:    v_readlane_b32 s38, v22, 5
+; GFX10_1-NEXT:    v_readlane_b32 s37, v22, 4
+; GFX10_1-NEXT:    v_readlane_b32 s36, v22, 3
+; GFX10_1-NEXT:    v_readlane_b32 s35, v22, 2
+; GFX10_1-NEXT:    v_readlane_b32 s34, v22, 1
+; GFX10_1-NEXT:    v_readlane_b32 s33, v22, 0
 ; GFX10_1-NEXT:    s_xor_saveexec_b32 s4, -1
 ; GFX10_1-NEXT:    s_add_i32 s5, s32, 0x100800
 ; GFX10_1-NEXT:    buffer_load_dword v22, off, s[0:3], s5 ; 4-byte Folded Reload
@@ -1473,31 +1475,31 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
 ; GFX10_3-NEXT:    s_add_i32 s5, s32, 0x100800
 ; GFX10_3-NEXT:    buffer_store_dword v22, off, s[0:3], s5 ; 4-byte Folded Spill
 ; GFX10_3-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10_3-NEXT:    v_writelane_b32 v22, s30, 0
+; GFX10_3-NEXT:    v_writelane_b32 v22, s33, 0
 ; GFX10_3-NEXT:    v_lshrrev_b32_e64 v0, 5, s32
 ; GFX10_3-NEXT:    s_lshr_b32 s4, s32, 5
 ; GFX10_3-NEXT:    s_add_i32 s58, s4, 0x4240
-; GFX10_3-NEXT:    v_writelane_b32 v22, s31, 1
+; GFX10_3-NEXT:    v_writelane_b32 v22, s34, 1
 ; GFX10_3-NEXT:    v_add_nc_u32_e32 v0, 64, v0
 ; GFX10_3-NEXT:    s_and_b32 s4, 0, exec_lo
 ; GFX10_3-NEXT:    ;;#ASMSTART
 ; GFX10_3-NEXT:    ; use alloca0 v0
 ; GFX10_3-NEXT:    ;;#ASMEND
-; GFX10_3-NEXT:    v_writelane_b32 v22, s33, 2
-; GFX10_3-NEXT:    v_writelane_b32 v22, s34, 3
-; GFX10_3-NEXT:    v_writelane_b32 v22, s35, 4
-; GFX10_3-NEXT:    v_writelane_b32 v22, s36, 5
-; GFX10_3-NEXT:    v_writelane_b32 v22, s37, 6
-; GFX10_3-NEXT:    v_writelane_b32 v22, s38, 7
-; GFX10_3-NEXT:    v_writelane_b32 v22, s39, 8
-; GFX10_3-NEXT:    v_writelane_b32 v22, s48, 9
-; GFX10_3-NEXT:    v_writelane_b32 v22, s49, 10
-; GFX10_3-NEXT:    v_writelane_b32 v22, s50, 11
-; GFX10_3-NEXT:    v_writelane_b32 v22, s51, 12
-; GFX10_3-NEXT:    v_writelane_b32 v22, s52, 13
-; GFX10_3-NEXT:    v_writelane_b32 v22, s53, 14
-; GFX10_3-NEXT:    v_writelane_b32 v22, s54, 15
-; GFX10_3-NEXT:    v_writelane_b32 v22, s55, 16
+; GFX10_3-NEXT:    v_writelane_b32 v22, s35, 2
+; GFX10_3-NEXT:    v_writelane_b32 v22, s36, 3
+; GFX10_3-NEXT:    v_writelane_b32 v22, s37, 4
+; GFX10_3-NEXT:    v_writelane_b32 v22, s38, 5
+; GFX10_3-NEXT:    v_writelane_b32 v22, s39, 6
+; GFX10_3-NEXT:    v_writelane_b32 v22, s48, 7
+; GFX10_3-NEXT:    v_writelane_b32 v22, s49, 8
+; GFX10_3-NEXT:    v_writelane_b32 v22, s50, 9
+; GFX10_3-NEXT:    v_writelane_b32 v22, s51, 10
+; GFX10_3-NEXT:    v_writelane_b32 v22, s52, 11
+; GFX10_3-NEXT:    v_writelane_b32 v22, s53, 12
+; GFX10_3-NEXT:    v_writelane_b32 v22, s54, 13
+; GFX10_3-NEXT:    v_writelane_b32 v22, s55, 14
+; GFX10_3-NEXT:    v_writelane_b32 v22, s30, 15
+; GFX10_3-NEXT:    v_writelane_b32 v22, s31, 16
 ; GFX10_3-NEXT:    ;;#ASMSTART
 ; GFX10_3-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc
 ; GFX10_3-NEXT:    ;;#ASMEND
@@ -1506,23 +1508,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
 ; GFX10_3-NEXT:    ;;#ASMSTART
 ; GFX10_3-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s54, scc
 ; GFX10_3-NEXT:    ;;#ASMEND
-; GFX10_3-NEXT:    v_readlane_b32 s55, v22, 16
-; GFX10_3-NEXT:    v_readlane_b32 s54, v22, 15
-; GFX10_3-NEXT:    v_readlane_b32 s53, v22, 14
-; GFX10_3-NEXT:    v_readlane_b32 s52, v22, 13
-; GFX10_3-NEXT:    v_readlane_b32 s51, v22, 12
-; GFX10_3-NEXT:    v_readlane_b32 s50, v22, 11
-; GFX10_3-NEXT:    v_readlane_b32 s49, v22, 10
-; GFX10_3-NEXT:    v_readlane_b32 s48, v22, 9
-; GFX10_3-NEXT:    v_readlane_b32 s39, v22, 8
-; GFX10_3-NEXT:    v_readlane_b32 s38, v22, 7
-; GFX10_3-NEXT:    v_readlane_b32 s37, v22, 6
-; GFX10_3-NEXT:    v_readlane_b32 s36, v22, 5
-; GFX10_3-NEXT:    v_readlane_b32 s35, v22, 4
-; GFX10_3-NEXT:    v_readlane_b32 s34, v22, 3
-; GFX10_3-NEXT:    v_readlane_b32 s33, v22, 2
-; GFX10_3-NEXT:    v_readlane_b32 s31, v22, 1
-; GFX10_3-NEXT:    v_readlane_b32 s30, v22, 0
+; GFX10_3-NEXT:    v_readlane_b32 s30, v22, 15
+; GFX10_3-NEXT:    v_readlane_b32 s31, v22, 16
+; GFX10_3-NEXT:    v_readlane_b32 s55, v22, 14
+; GFX10_3-NEXT:    v_readlane_b32 s54, v22, 13
+; GFX10_3-NEXT:    v_readlane_b32 s53, v22, 12
+; GFX10_3-NEXT:    v_readlane_b32 s52, v22, 11
+; GFX10_3-NEXT:    v_readlane_b32 s51, v22, 10
+; GFX10_3-NEXT:    v_readlane_b32 s50, v22, 9
+; GFX10_3-NEXT:    v_readlane_b32 s49, v22, 8
+; GFX10_3-NEXT:    v_readlane_b32 s48, v22, 7
+; GFX10_3-NEXT:    v_readlane_b32 s39, v22, 6
+; GFX10_3-NEXT:    v_readlane_b32 s38, v22, 5
+; GFX10_3-NEXT:    v_readlane_b32 s37, v22, 4
+; GFX10_3-NEXT:    v_readlane_b32 s36, v22, 3
+; GFX10_3-NEXT:    v_readlane_b32 s35, v22, 2
+; GFX10_3-NEXT:    v_readlane_b32 s34, v22, 1
+; GFX10_3-NEXT:    v_readlane_b32 s33, v22, 0
 ; GFX10_3-NEXT:    s_xor_saveexec_b32 s4, -1
 ; GFX10_3-NEXT:    s_add_i32 s5, s32, 0x100800
 ; GFX10_3-NEXT:    buffer_load_dword v22, off, s[0:3], s5 ; 4-byte Folded Reload
@@ -1537,30 +1539,30 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
 ; GFX11-NEXT:    s_add_i32 s1, s32, 0x8040
 ; GFX11-NEXT:    scratch_store_b32 off, v22, s1 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
-; GFX11-NEXT:    v_writelane_b32 v22, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v22, s33, 0
 ; GFX11-NEXT:    s_add_i32 s0, s32, 64
 ; GFX11-NEXT:    s_add_i32 s58, s32, 0x4240
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-NEXT:    s_and_b32 s0, 0, exec_lo
-; GFX11-NEXT:    v_writelane_b32 v22, s31, 1
+; GFX11-NEXT:    v_writelane_b32 v22, s34, 1
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ; use alloca0 v0
 ; GFX11-NEXT:    ;;#ASMEND
-; GFX11-NEXT:    v_writelane_b32 v22, s33, 2
-; GFX11-NEXT:    v_writelane_b32 v22, s34, 3
-; GFX11-NEXT:    v_writelane_b32 v22, s35, 4
-; GFX11-NEXT:    v_writelane_b32 v22, s36, 5
-; GFX11-NEXT:    v_writelane_b32 v22, s37, 6
-; GFX11-NEXT:    v_writelane_b32 v22, s38, 7
-; GFX11-NEXT:    v_writelane_b32 v22, s39, 8
-; GFX11-NEXT:    v_writelane_b32 v22, s48, 9
-; GFX11-NEXT:    v_writelane_b32 v22, s49, 10
-; GFX11-NEXT:    v_writelane_b32 v22, s50, 11
-; GFX11-NEXT:    v_writelane_b32 v22, s51, 12
-; GFX11-NEXT:    v_writelane_b32 v22, s52, 13
-; GFX11-NEXT:    v_writelane_b32 v22, s53, 14
-; GFX11-NEXT:    v_writelane_b32 v22, s54, 15
-; GFX11-NEXT:    v_writelane_b32 v22, s55, 16
+; GFX11-NEXT:    v_writelane_b32 v22, s35, 2
+; GFX11-NEXT:    v_writelane_b32 v22, s36, 3
+; GFX11-NEXT:    v_writelane_b32 v22, s37, 4
+; GFX11-NEXT:    v_writelane_b32 v22, s38, 5
+; GFX11-NEXT:    v_writelane_b32 v22, s39, 6
+; GFX11-NEXT:    v_writelane_b32 v22, s48, 7
+; GFX11-NEXT:    v_writelane_b32 v22, s49, 8
+; GFX11-NEXT:    v_writelane_b32 v22, s50, 9
+; GFX11-NEXT:    v_writelane_b32 v22, s51, 10
+; GFX11-NEXT:    v_writelane_b32 v22, s52, 11
+; GFX11-NEXT:    v_writelane_b32 v22, s53, 12
+; GFX11-NEXT:    v_writelane_b32 v22, s54, 13
+; GFX11-NEXT:    v_writelane_b32 v22, s55, 14
+; GFX11-NEXT:    v_writelane_b32 v22, s30, 15
+; GFX11-NEXT:    v_writelane_b32 v22, s31, 16
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc
 ; GFX11-NEXT:    ;;#ASMEND
@@ -1570,23 +1572,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
 ; GFX11-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s54, scc
 ; GFX11-NEXT:    ;;#ASMEND
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s55, v22, 16
-; GFX11-NEXT:    v_readlane_b32 s54, v22, 15
-; GFX11-NEXT:    v_readlane_b32 s53, v22, 14
-; GFX11-NEXT:    v_readlane_b32 s52, v22, 13
-; GFX11-NEXT:    v_readlane_b32 s51, v22, 12
-; GFX11-NEXT:    v_readlane_b32 s50, v22, 11
-; GFX11-NEXT:    v_readlane_b32 s49, v22, 10
-; GFX11-NEXT:    v_readlane_b32 s48, v22, 9
-; GFX11-NEXT:    v_readlane_b32 s39, v22, 8
-; GFX11-NEXT:    v_readlane_b32 s38, v22, 7
-; GFX11-NEXT:    v_readlane_b32 s37, v22, 6
-; GFX11-NEXT:    v_readlane_b32 s36, v22, 5
-; GFX11-NEXT:    v_readlane_b32 s35, v22, 4
-; GFX11-NEXT:    v_readlane_b32 s34, v22, 3
-; GFX11-NEXT:    v_readlane_b32 s33, v22, 2
-; GFX11-NEXT:    v_readlane_b32 s31, v22, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v22, 0
+; GFX11-NEXT:    v_readlane_b32 s30, v22, 15
+; GFX11-NEXT:    v_readlane_b32 s31, v22, 16
+; GFX11-NEXT:    v_readlane_b32 s55, v22, 14
+; GFX11-NEXT:    v_readlane_b32 s54, v22, 13
+; GFX11-NEXT:    v_readlane_b32 s53, v22, 12
+; GFX11-NEXT:    v_readlane_b32 s52, v22, 11
+; GFX11-NEXT:    v_readlane_b32 s51, v22, 10
+; GFX11-NEXT:    v_readlane_b32 s50, v22, 9
+; GFX11-NEXT:    v_readlane_b32 s49, v22, 8
+; GFX11-NEXT:    v_readlane_b32 s48, v22, 7
+; GFX11-NEXT:    v_readlane_b32 s39, v22, 6
+; GFX11-NEXT:    v_readlane_b32 s38, v22, 5
+; GFX11-NEXT:    v_readlane_b32 s37, v22, 4
+; GFX11-NEXT:    v_readlane_b32 s36, v22, 3
+; GFX11-NEXT:    v_readlane_b32 s35, v22, 2
+; GFX11-NEXT:    v_readlane_b32 s34, v22, 1
+; GFX11-NEXT:    v_readlane_b32 s33, v22, 0
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX11-NEXT:    s_add_i32 s1, s32, 0x8040
 ; GFX11-NEXT:    scratch_load_b32 v22, off, s1 ; 4-byte Folded Reload
@@ -1605,29 +1607,29 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
 ; GFX12-NEXT:    scratch_store_b32 off, v22, s32 offset:32768 ; 4-byte Folded Spill
 ; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12-NEXT:    s_mov_b32 exec_lo, s0
-; GFX12-NEXT:    v_writelane_b32 v22, s30, 0
+; GFX12-NEXT:    v_writelane_b32 v22, s33, 0
 ; GFX12-NEXT:    s_add_co_i32 s58, s32, 0x4200
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s32
 ; GFX12-NEXT:    s_and_b32 s0, 0, exec_lo
 ; GFX12-NEXT:    ;;#ASMSTART
 ; GFX12-NEXT:    ; use alloca0 v0
 ; GFX12-NEXT:    ;;#ASMEND
-; GFX12-NEXT:    v_writelane_b32 v22, s31, 1
-; GFX12-NEXT:    v_writelane_b32 v22, s33, 2
-; GFX12-NEXT:    v_writelane_b32 v22, s34, 3
-; GFX12-NEXT:    v_writelane_b32 v22, s35, 4
-; GFX12-NEXT:    v_writelane_b32 v22, s36, 5
-; GFX12-NEXT:    v_writelane_b32 v22, s37, 6
-; GFX12-NEXT:    v_writelane_b32 v22, s38, 7
-; GFX12-NEXT:    v_writelane_b32 v22, s39, 8
-; GFX12-NEXT:    v_writelane_b32 v22, s48, 9
-; GFX12-NEXT:    v_writelane_b32 v22, s49, 10
-; GFX12-NEXT:    v_writelane_b32 v22, s50, 11
-; GFX12-NEXT:    v_writelane_b32 v22, s51, 12
-; GFX12-NEXT:    v_writelane_b32 v22, s52, 13
-; GFX12-NEXT:    v_writelane_b32 v22, s53, 14
-; GFX12-NEXT:    v_writelane_b32 v22, s54, 15
-; GFX12-NEXT:    v_writelane_b32 v22, s55, 16
+; GFX12-NEXT:    v_writelane_b32 v22, s34, 1
+; GFX12-NEXT:    v_writelane_b32 v22, s35, 2
+; GFX12-NEXT:    v_writelane_b32 v22, s36, 3
+; GFX12-NEXT:    v_writelane_b32 v22, s37, 4
+; GFX12-NEXT:    v_writelane_b32 v22, s38, 5
+; GFX12-NEXT:    v_writelane_b32 v22, s39, 6
+; GFX12-NEXT:    v_writelane_b32 v22, s48, 7
+; GFX12-NEXT:    v_writelane_b32 v22, s49, 8
+; GFX12-NEXT:    v_writelane_b32 v22, s50, 9
+; GFX12-NEXT:    v_writelane_b32 v22, s51, 10
+; GFX12-NEXT:    v_writelane_b32 v22, s52, 11
+; GFX12-NEXT:    v_writelane_b32 v22, s53, 12
+; GFX12-NEXT:    v_writelane_b32 v22, s54, 13
+; GFX12-NEXT:    v_writelane_b32 v22, s55, 14
+; GFX12-NEXT:    v_writelane_b32 v22, s30, 15
+; GFX12-NEXT:    v_writelane_b32 v22, s31, 16
 ; GFX12-NEXT:    ;;#ASMSTART
 ; GFX12-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc
 ; GFX12-NEXT:    ;;#ASMEND
@@ -1637,23 +1639,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
 ; GFX12-NEXT:    ;;#ASMSTART
 ; GFX12-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s54, scc
 ; GFX12-NEXT:    ;;#ASMEND
-; GFX12-NEXT:    v_readlane_b32 s55, v22, 16
-; GFX12-NEXT:    v_readlane_b32 s54, v22, 15
-; GFX12-NEXT:    v_readlane_b32 s53, v22, 14
-; GFX12-NEXT:    v_readlane_b32 s52, v22, 13
-; GFX12-NEXT:    v_readlane_b32 s51, v22, 12
-; GFX12-NEXT:    v_readlane_b32 s50, v22, 11
-; GFX12-NEXT:    v_readlane_b32 s49, v22, 10
-; GFX12-NEXT:    v_readlane_b32 s48, v22, 9
-; GFX12-NEXT:    v_readlane_b32 s39, v22, 8
-; GFX12-NEXT:    v_readlane_b32 s38, v22, 7
-; GFX12-NEXT:    v_readlane_b32 s37, v22, 6
-; GFX12-NEXT:    v_readlane_b32 s36, v22, 5
-; GFX12-NEXT:    v_readlane_b32 s35, v22, 4
-; GFX12-NEXT:    v_readlane_b32 s34, v22, 3
-; GFX12-NEXT:    v_readlane_b32 s33, v22, 2
-; GFX12-NEXT:    v_readlane_b32 s31, v22, 1
-; GFX12-NEXT:    v_readlane_b32 s30, v22, 0
+; GFX12-NEXT:    v_readlane_b32 s30, v22, 15
+; GFX12-NEXT:    v_readlane_b32 s31, v22, 16
+; GFX12-NEXT:    v_readlane_b32 s55, v22, 14
+; GFX12-NEXT:    v_readlane_b32 s54, v22, 13
+; GFX12-NEXT:    v_readlane_b32 s53, v22, 12
+; GFX12-NEXT:    v_readlane_b32 s52, v22, 11
+; GFX12-NEXT:    v_readlane_b32 s51, v22, 10
+; GFX12-NEXT:    v_readlane_b32 s50, v22, 9
+; GFX12-NEXT:    v_readlane_b32 s49, v22, 8
+; GFX12-NEXT:    v_readlane_b32 s48, v22, 7
+; GFX12-NEXT:    v_readlane_b32 s39, v22, 6
+; GFX12-NEXT:    v_readlane_b32 s38, v22, 5
+; GFX12-NEXT:    v_readlane_b32 s37, v22, 4
+; GFX12-NEXT:    v_readlane_b32 s36, v22, 3
+; GFX12-NEXT:    v_readlane_b32 s35, v22, 2
+; GFX12-NEXT:    v_readlane_b32 s34, v22, 1
+; GFX12-NEXT:    v_readlane_b32 s33, v22, 0
 ; GFX12-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX12-NEXT:    scratch_load_b32 v22, off, s32 offset:32768 ; 4-byte Folded Reload
 ; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
diff --git a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
index c4322e260dece..b4c9519f2bec1 100644
--- a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
@@ -194,22 +194,22 @@ define void @slsr1_1(i32 %b.arg, i32 %s.arg) #0 {
 ; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX9-NEXT:    v_writelane_b32 v43, s4, 5
-; GFX9-NEXT:    v_writelane_b32 v43, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v43, s31, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0x800
-; GFX9-NEXT:    v_writelane_b32 v43, s34, 2
-; GFX9-NEXT:    v_writelane_b32 v43, s36, 3
+; GFX9-NEXT:    v_writelane_b32 v43, s34, 0
+; GFX9-NEXT:    v_writelane_b32 v43, s36, 1
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, foo@gotpcrel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, foo@gotpcrel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v43, s37, 4
+; GFX9-NEXT:    v_writelane_b32 v43, s37, 2
 ; GFX9-NEXT:    s_load_dwordx2 s[36:37], s[4:5], 0x0
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_mov_b32_e32 v40, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v41, v0
+; GFX9-NEXT:    v_writelane_b32 v43, s30, 3
 ; GFX9-NEXT:    v_mul_u32_u24_e32 v0, v41, v40
+; GFX9-NEXT:    v_writelane_b32 v43, s31, 4
 ; GFX9-NEXT:    s_mov_b32 s34, s15
 ; GFX9-NEXT:    v_and_b32_e32 v42, 0xffffff, v40
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -224,11 +224,11 @@ define void @slsr1_1(i32 %b.arg, i32 %s.arg) #0 {
 ; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_readlane_b32 s37, v43, 4
-; GFX9-NEXT:    v_readlane_b32 s36, v43, 3
-; GFX9-NEXT:    v_readlane_b32 s34, v43, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v43, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v43, 0
+; GFX9-NEXT:    v_readlane_b32 s30, v43, 3
+; GFX9-NEXT:    v_readlane_b32 s31, v43, 4
+; GFX9-NEXT:    v_readlane_b32 s37, v43, 2
+; GFX9-NEXT:    v_readlane_b32 s36, v43, 1
+; GFX9-NEXT:    v_readlane_b32 s34, v43, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v43, 5
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll
index 65446a036c91b..878302e4865bb 100644
--- a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll
+++ b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll
@@ -47,8 +47,8 @@ define internal fastcc void @csr_vgpr_spill_fp_callee() #0 {
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; clobber csr v40
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s31, v1, 1
 ; CHECK-NEXT:    v_readlane_b32 s30, v1, 0
+; CHECK-NEXT:    v_readlane_b32 s31, v1, 1
 ; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; CHECK-NEXT:    s_mov_b32 s32, s33
 ; CHECK-NEXT:    s_xor_saveexec_b64 s[4:5], -1
@@ -190,8 +190,8 @@ define hidden i32 @caller_save_vgpr_spill_fp_tail_call() #0 {
 ; CHECK-NEXT:    s_mov_b64 s[0:1], s[20:21]
 ; CHECK-NEXT:    s_mov_b64 s[2:3], s[22:23]
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT:    v_readlane_b32 s31, v1, 1
 ; CHECK-NEXT:    v_readlane_b32 s30, v1, 0
+; CHECK-NEXT:    v_readlane_b32 s31, v1, 1
 ; CHECK-NEXT:    s_mov_b32 s32, s33
 ; CHECK-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; CHECK-NEXT:    buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -224,8 +224,8 @@ define hidden i32 @caller_save_vgpr_spill_fp() #0 {
 ; CHECK-NEXT:    s_mov_b64 s[0:1], s[20:21]
 ; CHECK-NEXT:    s_mov_b64 s[2:3], s[22:23]
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT:    v_readlane_b32 s31, v2, 1
 ; CHECK-NEXT:    v_readlane_b32 s30, v2, 0
+; CHECK-NEXT:    v_readlane_b32 s31, v2, 1
 ; CHECK-NEXT:    s_mov_b32 s32, s33
 ; CHECK-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; CHECK-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/nested-calls.ll b/llvm/test/CodeGen/AMDGPU/nested-calls.ll
index ccaf0ac5377e4..8394b325bee6d 100644
--- a/llvm/test/CodeGen/AMDGPU/nested-calls.ll
+++ b/llvm/test/CodeGen/AMDGPU/nested-calls.ll
@@ -29,8 +29,8 @@ define void @test_func_call_external_void_func_i32_imm() #0 {
 ; GCN-NEXT:    v_writelane_b32 v40, s31, 1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 2
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -68,8 +68,8 @@ define void @test_func_call_external_void_func_i32_imm_stack_use() #0 {
 ; GCN-NEXT:    v_writelane_b32 v40, s31, 1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 2
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll b/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll
index 6b6c60ebe2a9e..133cc166c3311 100644
--- a/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll
+++ b/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll
@@ -247,8 +247,8 @@ define hidden void @_ZL3barv() #0 !dbg !1644 {
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; CHECK-NEXT:  .Ltmp1:
 ; CHECK-NEXT:    .loc 0 32 1 ; lane-info.cpp:32:1
-; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
 ; CHECK-NEXT:    v_readlane_b32 s30, v40, 0
+; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
 ; CHECK-NEXT:    s_mov_b32 s32, s33
 ; CHECK-NEXT:    v_readlane_b32 s4, v40, 2
 ; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/nofpclass-call.ll b/llvm/test/CodeGen/AMDGPU/nofpclass-call.ll
index 28bab355c359d..478a3194709b3 100644
--- a/llvm/test/CodeGen/AMDGPU/nofpclass-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/nofpclass-call.ll
@@ -35,9 +35,9 @@ define float @call_nofpclass_funcs_f32(ptr addrspace(1) %ptr) {
 ; CHECK-NEXT:    v_mov_b32_e32 v3, v0
 ; CHECK-NEXT:    v_mov_b32_e32 v0, v2
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; CHECK-NEXT:    v_readlane_b32 s30, v4, 0
 ; CHECK-NEXT:    v_min_f32_e32 v0, v3, v0
 ; CHECK-NEXT:    v_readlane_b32 s31, v4, 1
-; CHECK-NEXT:    v_readlane_b32 s30, v4, 0
 ; CHECK-NEXT:    s_mov_b32 s32, s33
 ; CHECK-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; CHECK-NEXT:    buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -85,10 +85,10 @@ define <2 x float> @call_nofpclass_funcs_v2f32(ptr addrspace(1) %ptr) {
 ; CHECK-NEXT:    v_mov_b32_e32 v0, v3
 ; CHECK-NEXT:    v_mov_b32_e32 v1, v2
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; CHECK-NEXT:    v_readlane_b32 s30, v6, 0
 ; CHECK-NEXT:    v_min_f32_e32 v0, v4, v0
 ; CHECK-NEXT:    v_min_f32_e32 v1, v5, v1
 ; CHECK-NEXT:    v_readlane_b32 s31, v6, 1
-; CHECK-NEXT:    v_readlane_b32 s30, v6, 0
 ; CHECK-NEXT:    s_mov_b32 s32, s33
 ; CHECK-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; CHECK-NEXT:    buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -136,9 +136,9 @@ define double @call_nofpclass_funcs_f64(ptr addrspace(1) %ptr) {
 ; CHECK-NEXT:    v_mov_b32_e32 v0, v5
 ; CHECK-NEXT:    v_mov_b32_e32 v1, v4
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; CHECK-NEXT:    v_readlane_b32 s30, v6, 0
 ; CHECK-NEXT:    v_min_f64 v[0:1], v[2:3], v[0:1]
 ; CHECK-NEXT:    v_readlane_b32 s31, v6, 1
-; CHECK-NEXT:    v_readlane_b32 s30, v6, 0
 ; CHECK-NEXT:    s_mov_b32 s32, s33
 ; CHECK-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; CHECK-NEXT:    buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -209,8 +209,8 @@ define nofpclass(nan inf) { double, double } @aggregate() {
 ; CHECK-NEXT:    v_writelane_b32 v40, s31, 1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
 ; CHECK-NEXT:    v_readlane_b32 s30, v40, 0
+; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
 ; CHECK-NEXT:    s_mov_b32 s32, s33
 ; CHECK-NEXT:    v_readlane_b32 s4, v40, 2
 ; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -247,10 +247,10 @@ define { float, float } @aggregate_use(float %z) {
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; CHECK-NEXT:    v_max_f32_e32 v2, v40, v40
 ; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; CHECK-NEXT:    v_readlane_b32 s30, v41, 0
 ; CHECK-NEXT:    v_min_f32_e32 v0, v0, v2
 ; CHECK-NEXT:    v_min_f32_e32 v1, v1, v2
 ; CHECK-NEXT:    v_readlane_b32 s31, v41, 1
-; CHECK-NEXT:    v_readlane_b32 s30, v41, 0
 ; CHECK-NEXT:    s_mov_b32 s32, s33
 ; CHECK-NEXT:    v_readlane_b32 s4, v41, 2
 ; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -317,13 +317,13 @@ define <5 x double> @call_nofpclass_funcs_v5f64_non_mvt_vector(ptr addrspace(1)
 ; CHECK-NEXT:    v_mov_b32_e32 v20, v8
 ; CHECK-NEXT:    v_mov_b32_e32 v21, v9
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; CHECK-NEXT:    v_readlane_b32 s30, v24, 0
 ; CHECK-NEXT:    v_min_f64 v[0:1], v[12:13], v[0:1]
 ; CHECK-NEXT:    v_min_f64 v[2:3], v[14:15], v[2:3]
 ; CHECK-NEXT:    v_min_f64 v[4:5], v[16:17], v[4:5]
 ; CHECK-NEXT:    v_min_f64 v[6:7], v[18:19], v[6:7]
 ; CHECK-NEXT:    v_min_f64 v[8:9], v[20:21], v[8:9]
 ; CHECK-NEXT:    v_readlane_b32 s31, v24, 1
-; CHECK-NEXT:    v_readlane_b32 s30, v24, 0
 ; CHECK-NEXT:    s_mov_b32 s32, s33
 ; CHECK-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; CHECK-NEXT:    buffer_load_dword v24, off, s[0:3], s33 ; 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll
index 1521ad5219174..6fefed6e07f2d 100644
--- a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll
+++ b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll
@@ -338,8 +338,8 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
 ; GFX906-NEXT:    buffer_load_dword v33, off, s[0:3], s33 offset:136 ; 4-byte Folded Reload
 ; GFX906-NEXT:    buffer_load_dword v34, off, s[0:3], s33 offset:140 ; 4-byte Folded Reload
 ; GFX906-NEXT:    buffer_load_dword v35, off, s[0:3], s33 offset:144 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_readlane_b32 s31, v41, 1
 ; GFX906-NEXT:    v_readlane_b32 s30, v41, 0
+; GFX906-NEXT:    v_readlane_b32 s31, v41, 1
 ; GFX906-NEXT:    s_mov_b32 s32, s33
 ; GFX906-NEXT:    v_readlane_b32 s4, v41, 4
 ; GFX906-NEXT:    v_readlane_b32 s34, v41, 2
@@ -398,21 +398,14 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
 ; GFX908-NEXT:    s_addk_i32 s32, 0x2c00
 ; GFX908-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX908-NEXT:    s_mov_b64 s[16:17], exec
-; GFX908-NEXT:    s_mov_b64 exec, 1
+; GFX908-NEXT:    s_mov_b64 exec, 3
 ; GFX908-NEXT:    buffer_store_dword v2, off, s[0:3], s33 offset:168
 ; GFX908-NEXT:    v_writelane_b32 v2, s30, 0
+; GFX908-NEXT:    v_writelane_b32 v2, s31, 1
 ; GFX908-NEXT:    buffer_store_dword v2, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX908-NEXT:    buffer_load_dword v2, off, s[0:3], s33 offset:168
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    s_mov_b64 exec, s[16:17]
-; GFX908-NEXT:    s_mov_b64 s[16:17], exec
-; GFX908-NEXT:    s_mov_b64 exec, 1
-; GFX908-NEXT:    buffer_store_dword v2, off, s[0:3], s33 offset:168
-; GFX908-NEXT:    v_writelane_b32 v2, s31, 0
-; GFX908-NEXT:    buffer_store_dword v2, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GFX908-NEXT:    buffer_load_dword v2, off, s[0:3], s33 offset:168
-; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    s_mov_b64 exec, s[16:17]
 ; GFX908-NEXT:    s_mov_b32 s21, s15
 ; GFX908-NEXT:    ; implicit-def: $vgpr39 : SGPR spill to VGPR lane
 ; GFX908-NEXT:    s_mov_b32 s22, s14
@@ -755,20 +748,12 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    flat_store_dwordx4 v[2:3], v[4:7]
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    s_mov_b64 exec, 1
-; GFX908-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:168
-; GFX908-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_readlane_b32 s31, v0, 0
-; GFX908-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:168
-; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX908-NEXT:    s_mov_b64 s[4:5], exec
-; GFX908-NEXT:    s_mov_b64 exec, 1
+; GFX908-NEXT:    s_mov_b64 exec, 3
 ; GFX908-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:168
 ; GFX908-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_readlane_b32 s30, v0, 0
+; GFX908-NEXT:    v_readlane_b32 s31, v0, 1
 ; GFX908-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:168
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    s_mov_b64 exec, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm-gfx12.mir b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm-gfx12.mir
index bc07c5f2b541f..6f40ceb9c9f93 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm-gfx12.mir
+++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm-gfx12.mir
@@ -145,6 +145,12 @@ body:             |
     ; GFX1250-LABEL: name: promote_async_load_mi_is_anchor
     ; GFX1250: liveins: $ttmp7, $vgpr0, $sgpr0_sgpr1
     ; GFX1250-NEXT: {{  $}}
+    ; GFX1250-NEXT: frame-setup CFI_INSTRUCTION llvm_def_aspace_cfa $sgpr32, 0, 6
+    ; GFX1250-NEXT: frame-setup CFI_INSTRUCTION llvm_register_pair $pc_reg, $sgpr30, 32, $sgpr31, 32
+    ; GFX1250-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr0
+    ; GFX1250-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr1
+    ; GFX1250-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr2
+    ; GFX1250-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr3
     ; GFX1250-NEXT: renamable $vgpr1 = V_LSHLREV_B32_e32 8, $vgpr0, implicit $exec
     ; GFX1250-NEXT: renamable $vgpr2, renamable $vcc_lo = V_ADD_CO_U32_e64 256, $vgpr0, 0, implicit $exec
     ; GFX1250-NEXT: renamable $vgpr3, $sgpr_null = V_ADDC_U32_e64 0, killed $vgpr0, killed $vcc_lo, 0, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll b/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll
index 9d8a54b4cc178..0f0274ccba346 100644
--- a/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll
+++ b/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll
@@ -20,9 +20,9 @@ define void @test_remat_s_getpc_b64() {
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_readlane_b32 s30, v2, 0
 ; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
 ; GFX9-NEXT:    v_readlane_b32 s31, v2, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v2, 0
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -45,8 +45,8 @@ define void @test_remat_s_getpc_b64() {
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT:    v_readlane_b32 s31, v2, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v2, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v2, 1
 ; GFX11-NEXT:    global_store_b64 v[0:1], v[0:1], off
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_load_b32 v2, off, s32 ; 4-byte Folded Reload
@@ -79,8 +79,8 @@ define void @test_remat_s_getpc_b64() {
 ; GFX12-NEXT:    s_sext_i32_i16 s1, s1
 ; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT:    v_readlane_b32 s31, v2, 1
 ; GFX12-NEXT:    v_readlane_b32 s30, v2, 0
+; GFX12-NEXT:    v_readlane_b32 s31, v2, 1
 ; GFX12-NEXT:    global_store_b64 v[0:1], v[0:1], off
 ; GFX12-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX12-NEXT:    scratch_load_b32 v2, off, s32 ; 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir
index 9005e26d24abb..7fb73949fce57 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir
@@ -28,16 +28,16 @@ body:             |
   ; GCN-LABEL: name: test_main
   ; GCN: bb.0:
   ; GCN-NEXT:   successors: %bb.1(0x80000000)
-  ; GCN-NEXT:   liveins: $vcc_hi, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $sgpr30, $sgpr31, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $sgpr96, $sgpr97, $sgpr98, $sgpr99, $sgpr100, $sgpr101, $sgpr102, $sgpr103, $vgpr0
+  ; GCN-NEXT:   liveins: $vcc_hi, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $sgpr96, $sgpr97, $sgpr98, $sgpr99, $sgpr100, $sgpr101, $sgpr102, $sgpr103, $vgpr0, $sgpr30_sgpr31
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT:   $vcc_hi = frame-setup COPY $sgpr33
   ; GCN-NEXT:   $sgpr33 = frame-setup COPY $sgpr32
   ; GCN-NEXT:   $sgpr0 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
-  ; GCN-NEXT:   SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.69, addrspace 5)
-  ; GCN-NEXT:   SCRATCH_STORE_DWORD_SADDR killed $vgpr2, $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.70, addrspace 5)
-  ; GCN-NEXT:   SCRATCH_STORE_DWORD_SADDR killed $vgpr3, $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.71, addrspace 5)
-  ; GCN-NEXT:   SCRATCH_STORE_DWORD_SADDR killed $vgpr4, $sgpr33, 12, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.72, addrspace 5)
-  ; GCN-NEXT:   SCRATCH_STORE_DWORD_SADDR killed $vgpr5, $sgpr33, 16, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.73, addrspace 5)
+  ; GCN-NEXT:   SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.68, addrspace 5)
+  ; GCN-NEXT:   SCRATCH_STORE_DWORD_SADDR killed $vgpr2, $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.69, addrspace 5)
+  ; GCN-NEXT:   SCRATCH_STORE_DWORD_SADDR killed $vgpr3, $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.70, addrspace 5)
+  ; GCN-NEXT:   SCRATCH_STORE_DWORD_SADDR killed $vgpr4, $sgpr33, 12, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.71, addrspace 5)
+  ; GCN-NEXT:   SCRATCH_STORE_DWORD_SADDR killed $vgpr5, $sgpr33, 16, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.72, addrspace 5)
   ; GCN-NEXT:   $exec_lo = S_MOV_B32 killed $sgpr0
   ; GCN-NEXT:   $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 24, implicit-def dead $scc
   ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, $vgpr2
@@ -66,48 +66,48 @@ body:             |
   ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr27, 23, $vgpr2
   ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr28, 24, $vgpr2
   ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr29, 25, $vgpr2
-  ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr30, 26, $vgpr2
-  ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr31, 27, $vgpr2
-  ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr64, 28, $vgpr2
-  ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr65, 29, $vgpr2
-  ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr66, 30, $vgpr2
-  ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr67, 31, $vgpr2
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr68, 0, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr69, 1, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr70, 2, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr71, 3, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr72, 4, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr73, 5, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr74, 6, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr75, 7, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr76, 8, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr77, 9, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr78, 10, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr79, 11, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr80, 12, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr81, 13, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr82, 14, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr83, 15, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr84, 16, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr85, 17, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr86, 18, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr87, 19, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr88, 20, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr89, 21, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr90, 22, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr91, 23, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr92, 24, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr93, 25, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr94, 26, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr95, 27, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr96, 28, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr97, 29, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr98, 30, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr99, 31, $vgpr3
-  ; GCN-NEXT:   $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr100, 0, $vgpr4
-  ; GCN-NEXT:   $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr101, 1, $vgpr4
-  ; GCN-NEXT:   $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr102, 2, $vgpr4
-  ; GCN-NEXT:   $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr103, 3, $vgpr4
+  ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr64, 26, $vgpr2
+  ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr65, 27, $vgpr2
+  ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr66, 28, $vgpr2
+  ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr67, 29, $vgpr2
+  ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr68, 30, $vgpr2
+  ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr69, 31, $vgpr2
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr70, 0, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr71, 1, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr72, 2, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr73, 3, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr74, 4, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr75, 5, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr76, 6, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr77, 7, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr78, 8, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr79, 9, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr80, 10, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr81, 11, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr82, 12, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr83, 13, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr84, 14, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr85, 15, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr86, 16, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr87, 17, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr88, 18, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr89, 19, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr90, 20, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr91, 21, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr92, 22, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr93, 23, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr94, 24, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr95, 25, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr96, 26, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr97, 27, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr98, 28, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr99, 29, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr100, 30, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr101, 31, $vgpr3
+  ; GCN-NEXT:   $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr102, 0, $vgpr4
+  ; GCN-NEXT:   $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr103, 1, $vgpr4
+  ; GCN-NEXT:   $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr30, 2, $vgpr4, implicit-def $sgpr30_sgpr31, implicit $sgpr30_sgpr31
+  ; GCN-NEXT:   $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr31, 3, $vgpr4, implicit $sgpr30_sgpr31
   ; GCN-NEXT:   $sgpr22 = IMPLICIT_DEF
   ; GCN-NEXT:   $vgpr5 = IMPLICIT_DEF
   ; GCN-NEXT:   $vgpr5 = SI_SPILL_S32_TO_VGPR $sgpr22, 0, killed $vgpr5
@@ -130,48 +130,48 @@ body:             |
   ; GCN-NEXT: bb.3:
   ; GCN-NEXT:   liveins: $vcc_hi
   ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT:   $sgpr103 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 3
-  ; GCN-NEXT:   $sgpr102 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 2
-  ; GCN-NEXT:   $sgpr101 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 1
-  ; GCN-NEXT:   $sgpr100 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 0
-  ; GCN-NEXT:   $sgpr99 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 31
-  ; GCN-NEXT:   $sgpr98 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 30
-  ; GCN-NEXT:   $sgpr97 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 29
-  ; GCN-NEXT:   $sgpr96 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 28
-  ; GCN-NEXT:   $sgpr95 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 27
-  ; GCN-NEXT:   $sgpr94 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 26
-  ; GCN-NEXT:   $sgpr93 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 25
-  ; GCN-NEXT:   $sgpr92 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 24
-  ; GCN-NEXT:   $sgpr91 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 23
-  ; GCN-NEXT:   $sgpr90 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 22
-  ; GCN-NEXT:   $sgpr89 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 21
-  ; GCN-NEXT:   $sgpr88 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 20
-  ; GCN-NEXT:   $sgpr87 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 19
-  ; GCN-NEXT:   $sgpr86 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 18
-  ; GCN-NEXT:   $sgpr85 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 17
-  ; GCN-NEXT:   $sgpr84 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 16
-  ; GCN-NEXT:   $sgpr83 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 15
-  ; GCN-NEXT:   $sgpr82 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 14
-  ; GCN-NEXT:   $sgpr81 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 13
-  ; GCN-NEXT:   $sgpr80 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 12
-  ; GCN-NEXT:   $sgpr79 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 11
-  ; GCN-NEXT:   $sgpr78 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 10
-  ; GCN-NEXT:   $sgpr77 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 9
-  ; GCN-NEXT:   $sgpr76 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 8
-  ; GCN-NEXT:   $sgpr75 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 7
-  ; GCN-NEXT:   $sgpr74 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 6
-  ; GCN-NEXT:   $sgpr73 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 5
-  ; GCN-NEXT:   $sgpr72 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 4
-  ; GCN-NEXT:   $sgpr71 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 3
-  ; GCN-NEXT:   $sgpr70 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 2
-  ; GCN-NEXT:   $sgpr69 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 1
-  ; GCN-NEXT:   $sgpr68 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 0
-  ; GCN-NEXT:   $sgpr67 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 31
-  ; GCN-NEXT:   $sgpr66 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 30
-  ; GCN-NEXT:   $sgpr65 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 29
-  ; GCN-NEXT:   $sgpr64 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 28
-  ; GCN-NEXT:   $sgpr31 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 27
-  ; GCN-NEXT:   $sgpr30 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 26
+  ; GCN-NEXT:   $sgpr30 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 2, implicit-def $sgpr30_sgpr31
+  ; GCN-NEXT:   $sgpr31 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 3
+  ; GCN-NEXT:   $sgpr103 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 1
+  ; GCN-NEXT:   $sgpr102 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 0
+  ; GCN-NEXT:   $sgpr101 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 31
+  ; GCN-NEXT:   $sgpr100 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 30
+  ; GCN-NEXT:   $sgpr99 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 29
+  ; GCN-NEXT:   $sgpr98 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 28
+  ; GCN-NEXT:   $sgpr97 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 27
+  ; GCN-NEXT:   $sgpr96 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 26
+  ; GCN-NEXT:   $sgpr95 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 25
+  ; GCN-NEXT:   $sgpr94 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 24
+  ; GCN-NEXT:   $sgpr93 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 23
+  ; GCN-NEXT:   $sgpr92 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 22
+  ; GCN-NEXT:   $sgpr91 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 21
+  ; GCN-NEXT:   $sgpr90 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 20
+  ; GCN-NEXT:   $sgpr89 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 19
+  ; GCN-NEXT:   $sgpr88 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 18
+  ; GCN-NEXT:   $sgpr87 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 17
+  ; GCN-NEXT:   $sgpr86 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 16
+  ; GCN-NEXT:   $sgpr85 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 15
+  ; GCN-NEXT:   $sgpr84 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 14
+  ; GCN-NEXT:   $sgpr83 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 13
+  ; GCN-NEXT:   $sgpr82 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 12
+  ; GCN-NEXT:   $sgpr81 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 11
+  ; GCN-NEXT:   $sgpr80 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 10
+  ; GCN-NEXT:   $sgpr79 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 9
+  ; GCN-NEXT:   $sgpr78 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 8
+  ; GCN-NEXT:   $sgpr77 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 7
+  ; GCN-NEXT:   $sgpr76 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 6
+  ; GCN-NEXT:   $sgpr75 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 5
+  ; GCN-NEXT:   $sgpr74 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 4
+  ; GCN-NEXT:   $sgpr73 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 3
+  ; GCN-NEXT:   $sgpr72 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 2
+  ; GCN-NEXT:   $sgpr71 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 1
+  ; GCN-NEXT:   $sgpr70 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 0
+  ; GCN-NEXT:   $sgpr69 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 31
+  ; GCN-NEXT:   $sgpr68 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 30
+  ; GCN-NEXT:   $sgpr67 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 29
+  ; GCN-NEXT:   $sgpr66 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 28
+  ; GCN-NEXT:   $sgpr65 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 27
+  ; GCN-NEXT:   $sgpr64 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 26
   ; GCN-NEXT:   $sgpr29 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 25
   ; GCN-NEXT:   $sgpr28 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 24
   ; GCN-NEXT:   $sgpr27 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 23
@@ -200,11 +200,11 @@ body:             |
   ; GCN-NEXT:   $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 0
   ; GCN-NEXT:   $sgpr32 = frame-destroy COPY $sgpr33
   ; GCN-NEXT:   $sgpr0 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
-  ; GCN-NEXT:   $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" load (s32) from %stack.69, addrspace 5)
-  ; GCN-NEXT:   $vgpr2 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" load (s32) from %stack.70, addrspace 5)
-  ; GCN-NEXT:   $vgpr3 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" load (s32) from %stack.71, addrspace 5)
-  ; GCN-NEXT:   $vgpr4 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 12, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" load (s32) from %stack.72, addrspace 5)
-  ; GCN-NEXT:   $vgpr5 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 16, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" load (s32) from %stack.73, addrspace 5)
+  ; GCN-NEXT:   $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" load (s32) from %stack.68, addrspace 5)
+  ; GCN-NEXT:   $vgpr2 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" load (s32) from %stack.69, addrspace 5)
+  ; GCN-NEXT:   $vgpr3 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" load (s32) from %stack.70, addrspace 5)
+  ; GCN-NEXT:   $vgpr4 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 12, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" load (s32) from %stack.71, addrspace 5)
+  ; GCN-NEXT:   $vgpr5 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 16, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" load (s32) from %stack.72, addrspace 5)
   ; GCN-NEXT:   $exec_lo = S_MOV_B32 killed $sgpr0
   ; GCN-NEXT:   $sgpr33 = frame-destroy COPY $vcc_hi
   ; GCN-NEXT:   S_ENDPGM 0
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll
index 979788c9231d0..bf9fe16f59544 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll
@@ -152,8 +152,8 @@ define void @spill_sgpr_with_no_lower_vgpr_available() #0 {
 ; GCN-NEXT:    s_mov_b64 s[2:3], s[22:23]
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GCN-NEXT:    v_readlane_b32 s31, v255, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v255, 0
+; GCN-NEXT:    v_readlane_b32 s31, v255, 1
 ; GCN-NEXT:    buffer_load_dword v254, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GCN-NEXT:    buffer_load_dword v253, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GCN-NEXT:    buffer_load_dword v252, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
@@ -445,8 +445,8 @@ define void @spill_to_lowest_available_vgpr() #0 {
 ; GCN-NEXT:    s_mov_b64 s[2:3], s[22:23]
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GCN-NEXT:    v_readlane_b32 s31, v254, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v254, 0
+; GCN-NEXT:    v_readlane_b32 s31, v254, 1
 ; GCN-NEXT:    buffer_load_dword v253, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GCN-NEXT:    buffer_load_dword v252, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GCN-NEXT:    buffer_load_dword v251, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
@@ -1636,21 +1636,14 @@ define void @spill_sgpr_no_free_vgpr_ipra() #0 {
 ; GCN-NEXT:    buffer_store_dword v254, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GCN-NEXT:    buffer_store_dword v255, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 s[16:17], exec
-; GCN-NEXT:    s_mov_b64 exec, 1
+; GCN-NEXT:    s_mov_b64 exec, 3
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:456
 ; GCN-NEXT:    v_writelane_b32 v0, s30, 0
+; GCN-NEXT:    v_writelane_b32 v0, s31, 1
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:448 ; 4-byte Folded Spill
 ; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:456
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_mov_b64 exec, s[16:17]
-; GCN-NEXT:    s_mov_b64 s[16:17], exec
-; GCN-NEXT:    s_mov_b64 exec, 1
-; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:456
-; GCN-NEXT:    v_writelane_b32 v0, s31, 0
-; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:452 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:456
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    s_mov_b64 exec, s[16:17]
 ; GCN-NEXT:    s_getpc_b64 s[16:17]
 ; GCN-NEXT:    s_add_u32 s16, s16, child_function_ipra@rel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s17, s17, child_function_ipra@rel32@hi+12
@@ -1660,20 +1653,12 @@ define void @spill_sgpr_no_free_vgpr_ipra() #0 {
 ; GCN-NEXT:    s_mov_b64 s[2:3], s[22:23]
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GCN-NEXT:    s_mov_b64 s[4:5], exec
-; GCN-NEXT:    s_mov_b64 exec, 1
-; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:456
-; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:452 ; 4-byte Folded Reload
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_readlane_b32 s31, v0, 0
-; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:456
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
-; GCN-NEXT:    s_mov_b64 s[4:5], exec
-; GCN-NEXT:    s_mov_b64 exec, 1
+; GCN-NEXT:    s_mov_b64 exec, 3
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:456
 ; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:448 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_readlane_b32 s30, v0, 0
+; GCN-NEXT:    v_readlane_b32 s31, v0, 1
 ; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:456
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll
index 1c2215d39dc02..feaca47f98e36 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll
@@ -14610,13 +14610,13 @@ define void @s_shuffle_v2i64_v8i64__15_2() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s30
 ; GFX900-NEXT:    s_mov_b32 s9, s31
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b32 s10, s12
 ; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -14639,13 +14639,13 @@ define void @s_shuffle_v2i64_v8i64__15_2() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s30
 ; GFX90A-NEXT:    s_mov_b32 s9, s31
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b32 s10, s12
 ; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -14750,13 +14750,13 @@ define void @s_shuffle_v2i64_v8i64__15_4() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s30
 ; GFX900-NEXT:    s_mov_b32 s9, s31
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b32 s10, s12
 ; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -14779,13 +14779,13 @@ define void @s_shuffle_v2i64_v8i64__15_4() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s30
 ; GFX90A-NEXT:    s_mov_b32 s9, s31
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b32 s10, s12
 ; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -14802,19 +14802,19 @@ define void @s_shuffle_v2i64_v8i64__15_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b32 s10, s12
+; GFX942-NEXT:    s_mov_b32 s11, s13
 ; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_mov_b32 s8, s30
 ; GFX942-NEXT:    s_mov_b32 s9, s31
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -14845,12 +14845,12 @@ define void @s_shuffle_v2i64_v8i64__15_5() {
 ; GFX900-NEXT:    s_mov_b32 s12, s30
 ; GFX900-NEXT:    s_mov_b32 s13, s31
 ; GFX900-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -14874,12 +14874,12 @@ define void @s_shuffle_v2i64_v8i64__15_5() {
 ; GFX90A-NEXT:    s_mov_b32 s12, s30
 ; GFX90A-NEXT:    s_mov_b32 s13, s31
 ; GFX90A-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -14999,22 +14999,22 @@ define void @s_shuffle_v2i64_v8i64__15_6() {
 ; GFX942-NEXT:    scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX942-NEXT:    v_writelane_b32 v0, s30, 0
-; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b32 s10, s12
+; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_mov_b32 s8, s30
 ; GFX942-NEXT:    s_mov_b32 s9, s31
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -15120,6 +15120,7 @@ define void @s_shuffle_v2i64_v8i64__15_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
@@ -15127,12 +15128,12 @@ define void @s_shuffle_v2i64_v8i64__15_7() {
 ; GFX942-NEXT:    s_mov_b32 s12, s30
 ; GFX942-NEXT:    s_mov_b32 s13, s31
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -16167,20 +16168,21 @@ define void @s_shuffle_v2i64_v8i64__12_0() {
 ; GFX942-NEXT:    scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX942-NEXT:    v_writelane_b32 v0, s30, 0
-; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[16:31]
+; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:15]
+; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_mov_b32 s10, s16
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b32 s11, s17
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -16890,20 +16892,21 @@ define void @s_shuffle_v2i64_v8i64__12_1() {
 ; GFX942-NEXT:    scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX942-NEXT:    v_writelane_b32 v0, s30, 0
-; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[16:31]
+; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:15]
+; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_mov_b32 s10, s18
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b32 s11, s19
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -17481,6 +17484,7 @@ define void @s_shuffle_v2i64_v8i64__9_2() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[16:31]
 ; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b32 s8, s18
 ; GFX900-NEXT:    s_mov_b32 s9, s19
 ; GFX900-NEXT:    s_mov_b32 s10, s12
@@ -17489,7 +17493,6 @@ define void @s_shuffle_v2i64_v8i64__9_2() {
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -17510,6 +17513,7 @@ define void @s_shuffle_v2i64_v8i64__9_2() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[16:31]
 ; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b32 s8, s18
 ; GFX90A-NEXT:    s_mov_b32 s9, s19
 ; GFX90A-NEXT:    s_mov_b32 s10, s12
@@ -17518,7 +17522,6 @@ define void @s_shuffle_v2i64_v8i64__9_2() {
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -17565,13 +17568,13 @@ define void @s_shuffle_v2i64_v8i64__10_2() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:19]
 ; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b32 s10, s20
 ; GFX900-NEXT:    s_mov_b32 s11, s21
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -17592,13 +17595,13 @@ define void @s_shuffle_v2i64_v8i64__10_2() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:19]
 ; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b32 s10, s20
 ; GFX90A-NEXT:    s_mov_b32 s11, s21
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -17612,6 +17615,7 @@ define void @s_shuffle_v2i64_v8i64__10_2() {
 ; GFX942-NEXT:    scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX942-NEXT:    v_writelane_b32 v0, s30, 0
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
@@ -17620,13 +17624,13 @@ define void @s_shuffle_v2i64_v8i64__10_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b32 s10, s20
 ; GFX942-NEXT:    s_mov_b32 s11, s21
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -17654,6 +17658,7 @@ define void @s_shuffle_v2i64_v8i64__11_2() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[16:31]
 ; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b32 s8, s22
 ; GFX900-NEXT:    s_mov_b32 s9, s23
 ; GFX900-NEXT:    s_mov_b32 s10, s12
@@ -17662,7 +17667,6 @@ define void @s_shuffle_v2i64_v8i64__11_2() {
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -17683,6 +17687,7 @@ define void @s_shuffle_v2i64_v8i64__11_2() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[16:31]
 ; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b32 s8, s22
 ; GFX90A-NEXT:    s_mov_b32 s9, s23
 ; GFX90A-NEXT:    s_mov_b32 s10, s12
@@ -17691,7 +17696,6 @@ define void @s_shuffle_v2i64_v8i64__11_2() {
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -17798,6 +17802,7 @@ define void @s_shuffle_v2i64_v8i64__13_2() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[16:31]
 ; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b32 s8, s26
 ; GFX900-NEXT:    s_mov_b32 s9, s27
 ; GFX900-NEXT:    s_mov_b32 s10, s12
@@ -17806,7 +17811,6 @@ define void @s_shuffle_v2i64_v8i64__13_2() {
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -17827,6 +17831,7 @@ define void @s_shuffle_v2i64_v8i64__13_2() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[16:31]
 ; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b32 s8, s26
 ; GFX90A-NEXT:    s_mov_b32 s9, s27
 ; GFX90A-NEXT:    s_mov_b32 s10, s12
@@ -17835,7 +17840,6 @@ define void @s_shuffle_v2i64_v8i64__13_2() {
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -18315,13 +18319,13 @@ define void @s_shuffle_v2i64_v8i64__10_3() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:19]
 ; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b32 s10, s22
 ; GFX900-NEXT:    s_mov_b32 s11, s23
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -18342,13 +18346,13 @@ define void @s_shuffle_v2i64_v8i64__10_3() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:19]
 ; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b32 s10, s22
 ; GFX90A-NEXT:    s_mov_b32 s11, s23
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -18362,6 +18366,7 @@ define void @s_shuffle_v2i64_v8i64__10_3() {
 ; GFX942-NEXT:    scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX942-NEXT:    v_writelane_b32 v0, s30, 0
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
@@ -18370,13 +18375,13 @@ define void @s_shuffle_v2i64_v8i64__10_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b32 s10, s22
 ; GFX942-NEXT:    s_mov_b32 s11, s23
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -18950,6 +18955,7 @@ define void @s_shuffle_v2i64_v8i64__9_4() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[16:31]
 ; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b32 s8, s18
 ; GFX900-NEXT:    s_mov_b32 s9, s19
 ; GFX900-NEXT:    s_mov_b32 s10, s12
@@ -18958,7 +18964,6 @@ define void @s_shuffle_v2i64_v8i64__9_4() {
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -18979,6 +18984,7 @@ define void @s_shuffle_v2i64_v8i64__9_4() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[16:31]
 ; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b32 s8, s18
 ; GFX90A-NEXT:    s_mov_b32 s9, s19
 ; GFX90A-NEXT:    s_mov_b32 s10, s12
@@ -18987,7 +18993,6 @@ define void @s_shuffle_v2i64_v8i64__9_4() {
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -19004,19 +19009,19 @@ define void @s_shuffle_v2i64_v8i64__9_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b32 s10, s12
+; GFX942-NEXT:    s_mov_b32 s11, s13
 ; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_mov_b32 s8, s18
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b32 s9, s19
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -19100,6 +19105,7 @@ define void @s_shuffle_v2i64_v8i64__11_4() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[16:31]
 ; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b32 s8, s22
 ; GFX900-NEXT:    s_mov_b32 s9, s23
 ; GFX900-NEXT:    s_mov_b32 s10, s12
@@ -19108,7 +19114,6 @@ define void @s_shuffle_v2i64_v8i64__11_4() {
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -19129,6 +19134,7 @@ define void @s_shuffle_v2i64_v8i64__11_4() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[16:31]
 ; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b32 s8, s22
 ; GFX90A-NEXT:    s_mov_b32 s9, s23
 ; GFX90A-NEXT:    s_mov_b32 s10, s12
@@ -19137,7 +19143,6 @@ define void @s_shuffle_v2i64_v8i64__11_4() {
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -19154,19 +19159,19 @@ define void @s_shuffle_v2i64_v8i64__11_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b32 s10, s12
+; GFX942-NEXT:    s_mov_b32 s11, s13
 ; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_mov_b32 s8, s22
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b32 s9, s23
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -19197,12 +19202,12 @@ define void @s_shuffle_v2i64_v8i64__12_4() {
 ; GFX900-NEXT:    s_mov_b32 s26, s12
 ; GFX900-NEXT:    s_mov_b32 s27, s13
 ; GFX900-NEXT:    s_mov_b64 s[8:9], s[24:25]
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b64 s[10:11], s[26:27]
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -19226,12 +19231,12 @@ define void @s_shuffle_v2i64_v8i64__12_4() {
 ; GFX90A-NEXT:    s_mov_b32 s26, s12
 ; GFX90A-NEXT:    s_mov_b32 s27, s13
 ; GFX90A-NEXT:    s_mov_b64 s[8:9], s[24:25]
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b64 s[10:11], s[26:27]
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -19276,6 +19281,7 @@ define void @s_shuffle_v2i64_v8i64__13_4() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[16:31]
 ; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b32 s8, s26
 ; GFX900-NEXT:    s_mov_b32 s9, s27
 ; GFX900-NEXT:    s_mov_b32 s10, s12
@@ -19284,7 +19290,6 @@ define void @s_shuffle_v2i64_v8i64__13_4() {
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -19305,6 +19310,7 @@ define void @s_shuffle_v2i64_v8i64__13_4() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[16:31]
 ; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b32 s8, s26
 ; GFX90A-NEXT:    s_mov_b32 s9, s27
 ; GFX90A-NEXT:    s_mov_b32 s10, s12
@@ -19313,7 +19319,6 @@ define void @s_shuffle_v2i64_v8i64__13_4() {
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -19330,19 +19335,19 @@ define void @s_shuffle_v2i64_v8i64__13_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b32 s10, s12
+; GFX942-NEXT:    s_mov_b32 s11, s13
 ; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_mov_b32 s8, s26
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b32 s9, s27
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -19374,11 +19379,11 @@ define void @s_shuffle_v2i64_v8i64__14_4() {
 ; GFX900-NEXT:    s_mov_b32 s31, s13
 ; GFX900-NEXT:    s_mov_b64 s[8:9], s[28:29]
 ; GFX900-NEXT:    s_mov_b64 s[10:11], s[30:31]
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -19403,11 +19408,11 @@ define void @s_shuffle_v2i64_v8i64__14_4() {
 ; GFX90A-NEXT:    s_mov_b32 s31, s13
 ; GFX90A-NEXT:    s_mov_b64 s[8:9], s[28:29]
 ; GFX90A-NEXT:    s_mov_b64 s[10:11], s[30:31]
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -19874,12 +19879,12 @@ define void @s_shuffle_v2i64_v8i64__9_5() {
 ; GFX900-NEXT:    s_mov_b32 s12, s18
 ; GFX900-NEXT:    s_mov_b32 s13, s19
 ; GFX900-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -19903,12 +19908,12 @@ define void @s_shuffle_v2i64_v8i64__9_5() {
 ; GFX90A-NEXT:    s_mov_b32 s12, s18
 ; GFX90A-NEXT:    s_mov_b32 s13, s19
 ; GFX90A-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -20012,12 +20017,12 @@ define void @s_shuffle_v2i64_v8i64__11_5() {
 ; GFX900-NEXT:    s_mov_b32 s12, s22
 ; GFX900-NEXT:    s_mov_b32 s13, s23
 ; GFX900-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -20041,12 +20046,12 @@ define void @s_shuffle_v2i64_v8i64__11_5() {
 ; GFX90A-NEXT:    s_mov_b32 s12, s22
 ; GFX90A-NEXT:    s_mov_b32 s13, s23
 ; GFX90A-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -20094,12 +20099,12 @@ define void @s_shuffle_v2i64_v8i64__12_5() {
 ; GFX900-NEXT:    s_mov_b32 s26, s14
 ; GFX900-NEXT:    s_mov_b32 s27, s15
 ; GFX900-NEXT:    s_mov_b64 s[8:9], s[24:25]
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b64 s[10:11], s[26:27]
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -20123,12 +20128,12 @@ define void @s_shuffle_v2i64_v8i64__12_5() {
 ; GFX90A-NEXT:    s_mov_b32 s26, s14
 ; GFX90A-NEXT:    s_mov_b32 s27, s15
 ; GFX90A-NEXT:    s_mov_b64 s[8:9], s[24:25]
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b64 s[10:11], s[26:27]
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -20176,12 +20181,12 @@ define void @s_shuffle_v2i64_v8i64__13_5() {
 ; GFX900-NEXT:    s_mov_b32 s12, s26
 ; GFX900-NEXT:    s_mov_b32 s13, s27
 ; GFX900-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -20205,12 +20210,12 @@ define void @s_shuffle_v2i64_v8i64__13_5() {
 ; GFX90A-NEXT:    s_mov_b32 s12, s26
 ; GFX90A-NEXT:    s_mov_b32 s13, s27
 ; GFX90A-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -20259,11 +20264,11 @@ define void @s_shuffle_v2i64_v8i64__14_5() {
 ; GFX900-NEXT:    s_mov_b32 s31, s15
 ; GFX900-NEXT:    s_mov_b64 s[8:9], s[28:29]
 ; GFX900-NEXT:    s_mov_b64 s[10:11], s[30:31]
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -20288,11 +20293,11 @@ define void @s_shuffle_v2i64_v8i64__14_5() {
 ; GFX90A-NEXT:    s_mov_b32 s31, s15
 ; GFX90A-NEXT:    s_mov_b64 s[8:9], s[28:29]
 ; GFX90A-NEXT:    s_mov_b64 s[10:11], s[30:31]
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -20846,22 +20851,22 @@ define void @s_shuffle_v2i64_v8i64__9_6() {
 ; GFX942-NEXT:    scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX942-NEXT:    v_writelane_b32 v0, s30, 0
-; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b32 s10, s12
+; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_mov_b32 s8, s18
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b32 s9, s19
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -21020,22 +21025,22 @@ define void @s_shuffle_v2i64_v8i64__11_6() {
 ; GFX942-NEXT:    scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX942-NEXT:    v_writelane_b32 v0, s30, 0
-; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b32 s10, s12
+; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_mov_b32 s8, s22
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b32 s9, s23
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -21244,22 +21249,22 @@ define void @s_shuffle_v2i64_v8i64__13_6() {
 ; GFX942-NEXT:    scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX942-NEXT:    v_writelane_b32 v0, s30, 0
-; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b32 s10, s12
+; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_mov_b32 s8, s26
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b32 s9, s27
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -21362,10 +21367,11 @@ define void @s_shuffle_v2i64_v8i64__14_6() {
 ; GFX942-NEXT:    scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX942-NEXT:    v_writelane_b32 v0, s30, 0
-; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -21373,11 +21379,11 @@ define void @s_shuffle_v2i64_v8i64__14_6() {
 ; GFX942-NEXT:    s_mov_b32 s31, s13
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[28:29]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[30:31]
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -21909,6 +21915,7 @@ define void @s_shuffle_v2i64_v8i64__9_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
@@ -21916,12 +21923,12 @@ define void @s_shuffle_v2i64_v8i64__9_7() {
 ; GFX942-NEXT:    s_mov_b32 s12, s18
 ; GFX942-NEXT:    s_mov_b32 s13, s19
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -22083,6 +22090,7 @@ define void @s_shuffle_v2i64_v8i64__11_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
@@ -22090,12 +22098,12 @@ define void @s_shuffle_v2i64_v8i64__11_7() {
 ; GFX942-NEXT:    s_mov_b32 s12, s22
 ; GFX942-NEXT:    s_mov_b32 s13, s23
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -22307,6 +22315,7 @@ define void @s_shuffle_v2i64_v8i64__13_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
@@ -22314,12 +22323,12 @@ define void @s_shuffle_v2i64_v8i64__13_7() {
 ; GFX942-NEXT:    s_mov_b32 s12, s26
 ; GFX942-NEXT:    s_mov_b32 s13, s27
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -22422,10 +22431,11 @@ define void @s_shuffle_v2i64_v8i64__14_7() {
 ; GFX942-NEXT:    scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX942-NEXT:    v_writelane_b32 v0, s30, 0
-; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -22433,11 +22443,11 @@ define void @s_shuffle_v2i64_v8i64__14_7() {
 ; GFX942-NEXT:    s_mov_b32 s31, s15
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[28:29]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[30:31]
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -23434,12 +23444,12 @@ define void @s_shuffle_v2i64_v8i64__4_9() {
 ; GFX900-NEXT:    s_mov_b32 s14, s18
 ; GFX900-NEXT:    s_mov_b32 s15, s19
 ; GFX900-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -23463,12 +23473,12 @@ define void @s_shuffle_v2i64_v8i64__4_9() {
 ; GFX90A-NEXT:    s_mov_b32 s14, s18
 ; GFX90A-NEXT:    s_mov_b32 s15, s19
 ; GFX90A-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -23513,13 +23523,13 @@ define void @s_shuffle_v2i64_v8i64__5_9() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:23]
 ; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b32 s8, s26
 ; GFX900-NEXT:    s_mov_b32 s9, s27
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -23540,13 +23550,13 @@ define void @s_shuffle_v2i64_v8i64__5_9() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:23]
 ; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b32 s8, s26
 ; GFX90A-NEXT:    s_mov_b32 s9, s27
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -23560,6 +23570,7 @@ define void @s_shuffle_v2i64_v8i64__5_9() {
 ; GFX942-NEXT:    scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX942-NEXT:    v_writelane_b32 v0, s30, 0
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
@@ -23568,13 +23579,13 @@ define void @s_shuffle_v2i64_v8i64__5_9() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:23]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b32 s8, s26
 ; GFX942-NEXT:    s_mov_b32 s9, s27
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -23680,6 +23691,7 @@ define void @s_shuffle_v2i64_v8i64__6_9() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
@@ -23687,12 +23699,12 @@ define void @s_shuffle_v2i64_v8i64__6_9() {
 ; GFX942-NEXT:    s_mov_b32 s14, s18
 ; GFX942-NEXT:    s_mov_b32 s15, s19
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -24284,12 +24296,12 @@ define void @s_shuffle_v2i64_v8i64__4_10() {
 ; GFX900-NEXT:    s_mov_b32 s14, s20
 ; GFX900-NEXT:    s_mov_b32 s15, s21
 ; GFX900-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -24313,12 +24325,12 @@ define void @s_shuffle_v2i64_v8i64__4_10() {
 ; GFX90A-NEXT:    s_mov_b32 s14, s20
 ; GFX90A-NEXT:    s_mov_b32 s15, s21
 ; GFX90A-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -24363,6 +24375,7 @@ define void @s_shuffle_v2i64_v8i64__5_10() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[16:31]
 ; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b32 s8, s14
 ; GFX900-NEXT:    s_mov_b32 s9, s15
 ; GFX900-NEXT:    s_mov_b32 s10, s20
@@ -24371,7 +24384,6 @@ define void @s_shuffle_v2i64_v8i64__5_10() {
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -24392,6 +24404,7 @@ define void @s_shuffle_v2i64_v8i64__5_10() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[16:31]
 ; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b32 s8, s14
 ; GFX90A-NEXT:    s_mov_b32 s9, s15
 ; GFX90A-NEXT:    s_mov_b32 s10, s20
@@ -24400,7 +24413,6 @@ define void @s_shuffle_v2i64_v8i64__5_10() {
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -24524,6 +24536,7 @@ define void @s_shuffle_v2i64_v8i64__6_10() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
@@ -24531,12 +24544,12 @@ define void @s_shuffle_v2i64_v8i64__6_10() {
 ; GFX942-NEXT:    s_mov_b32 s14, s20
 ; GFX942-NEXT:    s_mov_b32 s15, s21
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -24639,22 +24652,22 @@ define void @s_shuffle_v2i64_v8i64__7_10() {
 ; GFX942-NEXT:    scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX942-NEXT:    v_writelane_b32 v0, s30, 0
-; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b32 s8, s14
+; GFX942-NEXT:    s_mov_b32 s9, s15
+; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
 ; GFX942-NEXT:    s_mov_b32 s10, s20
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b32 s11, s21
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -25235,13 +25248,13 @@ define void @s_shuffle_v2i64_v8i64__3_11() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:19]
 ; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b32 s8, s22
 ; GFX900-NEXT:    s_mov_b32 s9, s23
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -25262,13 +25275,13 @@ define void @s_shuffle_v2i64_v8i64__3_11() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:19]
 ; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b32 s8, s22
 ; GFX90A-NEXT:    s_mov_b32 s9, s23
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -25282,6 +25295,7 @@ define void @s_shuffle_v2i64_v8i64__3_11() {
 ; GFX942-NEXT:    scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX942-NEXT:    v_writelane_b32 v0, s30, 0
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
@@ -25290,13 +25304,13 @@ define void @s_shuffle_v2i64_v8i64__3_11() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b32 s8, s22
 ; GFX942-NEXT:    s_mov_b32 s9, s23
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -25327,12 +25341,12 @@ define void @s_shuffle_v2i64_v8i64__4_11() {
 ; GFX900-NEXT:    s_mov_b32 s14, s22
 ; GFX900-NEXT:    s_mov_b32 s15, s23
 ; GFX900-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -25356,12 +25370,12 @@ define void @s_shuffle_v2i64_v8i64__4_11() {
 ; GFX90A-NEXT:    s_mov_b32 s14, s22
 ; GFX90A-NEXT:    s_mov_b32 s15, s23
 ; GFX90A-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -25540,6 +25554,7 @@ define void @s_shuffle_v2i64_v8i64__6_11() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
@@ -25547,12 +25562,12 @@ define void @s_shuffle_v2i64_v8i64__6_11() {
 ; GFX942-NEXT:    s_mov_b32 s14, s22
 ; GFX942-NEXT:    s_mov_b32 s15, s23
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -26144,12 +26159,12 @@ define void @s_shuffle_v2i64_v8i64__4_12() {
 ; GFX900-NEXT:    s_mov_b32 s14, s24
 ; GFX900-NEXT:    s_mov_b32 s15, s25
 ; GFX900-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -26173,12 +26188,12 @@ define void @s_shuffle_v2i64_v8i64__4_12() {
 ; GFX90A-NEXT:    s_mov_b32 s14, s24
 ; GFX90A-NEXT:    s_mov_b32 s15, s25
 ; GFX90A-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -26223,6 +26238,7 @@ define void @s_shuffle_v2i64_v8i64__5_12() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[16:31]
 ; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b32 s8, s14
 ; GFX900-NEXT:    s_mov_b32 s9, s15
 ; GFX900-NEXT:    s_mov_b32 s10, s24
@@ -26231,7 +26247,6 @@ define void @s_shuffle_v2i64_v8i64__5_12() {
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -26252,6 +26267,7 @@ define void @s_shuffle_v2i64_v8i64__5_12() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[16:31]
 ; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b32 s8, s14
 ; GFX90A-NEXT:    s_mov_b32 s9, s15
 ; GFX90A-NEXT:    s_mov_b32 s10, s24
@@ -26260,7 +26276,6 @@ define void @s_shuffle_v2i64_v8i64__5_12() {
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -26384,6 +26399,7 @@ define void @s_shuffle_v2i64_v8i64__6_12() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
@@ -26391,12 +26407,12 @@ define void @s_shuffle_v2i64_v8i64__6_12() {
 ; GFX942-NEXT:    s_mov_b32 s14, s24
 ; GFX942-NEXT:    s_mov_b32 s15, s25
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -26499,22 +26515,22 @@ define void @s_shuffle_v2i64_v8i64__7_12() {
 ; GFX942-NEXT:    scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX942-NEXT:    v_writelane_b32 v0, s30, 0
-; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b32 s8, s14
+; GFX942-NEXT:    s_mov_b32 s9, s15
+; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
 ; GFX942-NEXT:    s_mov_b32 s10, s24
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b32 s11, s25
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -26880,20 +26896,21 @@ define void @s_shuffle_v2i64_v8i64__1_13() {
 ; GFX942-NEXT:    scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX942-NEXT:    v_writelane_b32 v0, s30, 0
-; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[16:31]
+; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:15]
+; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_mov_b32 s8, s18
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b32 s9, s19
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -27040,12 +27057,12 @@ define void @s_shuffle_v2i64_v8i64__4_13() {
 ; GFX900-NEXT:    s_mov_b32 s14, s26
 ; GFX900-NEXT:    s_mov_b32 s15, s27
 ; GFX900-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -27069,12 +27086,12 @@ define void @s_shuffle_v2i64_v8i64__4_13() {
 ; GFX90A-NEXT:    s_mov_b32 s14, s26
 ; GFX90A-NEXT:    s_mov_b32 s15, s27
 ; GFX90A-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -27122,12 +27139,12 @@ define void @s_shuffle_v2i64_v8i64__5_13() {
 ; GFX900-NEXT:    s_mov_b32 s24, s14
 ; GFX900-NEXT:    s_mov_b32 s25, s15
 ; GFX900-NEXT:    s_mov_b64 s[8:9], s[24:25]
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b64 s[10:11], s[26:27]
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -27151,12 +27168,12 @@ define void @s_shuffle_v2i64_v8i64__5_13() {
 ; GFX90A-NEXT:    s_mov_b32 s24, s14
 ; GFX90A-NEXT:    s_mov_b32 s25, s15
 ; GFX90A-NEXT:    s_mov_b64 s[8:9], s[24:25]
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b64 s[10:11], s[26:27]
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -27279,6 +27296,7 @@ define void @s_shuffle_v2i64_v8i64__6_13() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
@@ -27286,12 +27304,12 @@ define void @s_shuffle_v2i64_v8i64__6_13() {
 ; GFX942-NEXT:    s_mov_b32 s14, s26
 ; GFX942-NEXT:    s_mov_b32 s15, s27
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -27997,12 +28015,12 @@ define void @s_shuffle_v2i64_v8i64__4_14() {
 ; GFX900-NEXT:    s_mov_b32 s14, s28
 ; GFX900-NEXT:    s_mov_b32 s15, s29
 ; GFX900-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -28026,12 +28044,12 @@ define void @s_shuffle_v2i64_v8i64__4_14() {
 ; GFX90A-NEXT:    s_mov_b32 s14, s28
 ; GFX90A-NEXT:    s_mov_b32 s15, s29
 ; GFX90A-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -28076,6 +28094,7 @@ define void @s_shuffle_v2i64_v8i64__5_14() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[16:31]
 ; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b32 s8, s14
 ; GFX900-NEXT:    s_mov_b32 s9, s15
 ; GFX900-NEXT:    s_mov_b32 s10, s28
@@ -28084,7 +28103,6 @@ define void @s_shuffle_v2i64_v8i64__5_14() {
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -28105,6 +28123,7 @@ define void @s_shuffle_v2i64_v8i64__5_14() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[16:31]
 ; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b32 s8, s14
 ; GFX90A-NEXT:    s_mov_b32 s9, s15
 ; GFX90A-NEXT:    s_mov_b32 s10, s28
@@ -28113,7 +28132,6 @@ define void @s_shuffle_v2i64_v8i64__5_14() {
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -28237,6 +28255,7 @@ define void @s_shuffle_v2i64_v8i64__6_14() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
@@ -28244,12 +28263,12 @@ define void @s_shuffle_v2i64_v8i64__6_14() {
 ; GFX942-NEXT:    s_mov_b32 s14, s28
 ; GFX942-NEXT:    s_mov_b32 s15, s29
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -28352,22 +28371,22 @@ define void @s_shuffle_v2i64_v8i64__7_14() {
 ; GFX942-NEXT:    scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX942-NEXT:    v_writelane_b32 v0, s30, 0
-; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b32 s8, s14
+; GFX942-NEXT:    s_mov_b32 s9, s15
+; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
 ; GFX942-NEXT:    s_mov_b32 s10, s28
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b32 s11, s29
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -28978,12 +28997,12 @@ define void @s_shuffle_v2i64_v8i64__4_15() {
 ; GFX900-NEXT:    s_mov_b32 s14, s30
 ; GFX900-NEXT:    s_mov_b32 s15, s31
 ; GFX900-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -29007,12 +29026,12 @@ define void @s_shuffle_v2i64_v8i64__4_15() {
 ; GFX90A-NEXT:    s_mov_b32 s14, s30
 ; GFX90A-NEXT:    s_mov_b32 s15, s31
 ; GFX90A-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -29061,11 +29080,11 @@ define void @s_shuffle_v2i64_v8i64__5_15() {
 ; GFX900-NEXT:    s_mov_b32 s29, s15
 ; GFX900-NEXT:    s_mov_b64 s[8:9], s[28:29]
 ; GFX900-NEXT:    s_mov_b64 s[10:11], s[30:31]
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -29090,11 +29109,11 @@ define void @s_shuffle_v2i64_v8i64__5_15() {
 ; GFX90A-NEXT:    s_mov_b32 s29, s15
 ; GFX90A-NEXT:    s_mov_b64 s[8:9], s[28:29]
 ; GFX90A-NEXT:    s_mov_b64 s[10:11], s[30:31]
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -29219,6 +29238,7 @@ define void @s_shuffle_v2i64_v8i64__6_15() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
@@ -29226,12 +29246,12 @@ define void @s_shuffle_v2i64_v8i64__6_15() {
 ; GFX942-NEXT:    s_mov_b32 s14, s30
 ; GFX942-NEXT:    s_mov_b32 s15, s31
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -29334,22 +29354,23 @@ define void @s_shuffle_v2i64_v8i64__7_15() {
 ; GFX942-NEXT:    scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX942-NEXT:    v_writelane_b32 v0, s30, 0
-; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[16:31]
+; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:15]
+; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_mov_b32 s28, s14
 ; GFX942-NEXT:    s_mov_b32 s29, s15
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[28:29]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[30:31]
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-vgpr-lanes-usage.mir b/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-vgpr-lanes-usage.mir
index 1ffef8e60d90d..ea67593d72761 100644
--- a/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-vgpr-lanes-usage.mir
+++ b/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-vgpr-lanes-usage.mir
@@ -24,10 +24,10 @@ machineFunctionInfo:
 body:             |
   bb.0:
     ; SGPR_SPILLED-LABEL: name: stack-slot-share-equal-sized-spills
-    ; SGPR_SPILLED: liveins: $sgpr30, $sgpr31, $vgpr62
+    ; SGPR_SPILLED: liveins: $vgpr62, $sgpr30_sgpr31
     ; SGPR_SPILLED-NEXT: {{  $}}
-    ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr30, 0, $vgpr62
-    ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr62
+    ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR $sgpr30, 0, $vgpr62, implicit-def $sgpr30_sgpr31, implicit $sgpr30_sgpr31
+    ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr62, implicit killed $sgpr30_sgpr31
     ; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
     ; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr32, 0, [[DEF]]
     ; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr0, 1, [[DEF]], implicit-def $sgpr0_sgpr1, implicit $sgpr0_sgpr1
@@ -89,10 +89,10 @@ machineFunctionInfo:
 body:             |
   bb.0:
     ; SGPR_SPILLED-LABEL: name: stack-slot-share-unequal-sized-spills-with-large-spill-first
-    ; SGPR_SPILLED: liveins: $sgpr30, $sgpr31, $vgpr62
+    ; SGPR_SPILLED: liveins: $vgpr62, $sgpr30_sgpr31
     ; SGPR_SPILLED-NEXT: {{  $}}
-    ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr30, 0, $vgpr62
-    ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr62
+    ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR $sgpr30, 0, $vgpr62, implicit-def $sgpr30_sgpr31, implicit $sgpr30_sgpr31
+    ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr62, implicit killed $sgpr30_sgpr31
     ; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
     ; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr32, 0, [[DEF]]
     ; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr2, 1, [[DEF]], implicit-def $sgpr2_sgpr3, implicit $sgpr2_sgpr3
@@ -152,10 +152,10 @@ machineFunctionInfo:
 body:             |
   bb.0:
     ; SGPR_SPILLED-LABEL: name: stack-slot-share-unequal-sized-spills-with-small-spill-first
-    ; SGPR_SPILLED: liveins: $sgpr30, $sgpr31, $vgpr62
+    ; SGPR_SPILLED: liveins: $vgpr62, $sgpr30_sgpr31
     ; SGPR_SPILLED-NEXT: {{  $}}
-    ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr30, 0, $vgpr62
-    ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr62
+    ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR $sgpr30, 0, $vgpr62, implicit-def $sgpr30_sgpr31, implicit $sgpr30_sgpr31
+    ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr62, implicit killed $sgpr30_sgpr31
     ; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
     ; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr32, 0, [[DEF]]
     ; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr0, 1, [[DEF]]
diff --git a/llvm/test/CodeGen/AMDGPU/sibling-call.ll b/llvm/test/CodeGen/AMDGPU/sibling-call.ll
index ec940d9d0955f..d9d2a99c3e02d 100644
--- a/llvm/test/CodeGen/AMDGPU/sibling-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/sibling-call.ll
@@ -242,8 +242,8 @@ define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32_byval_parent(i32 %a, pt
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], s32
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 2
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -425,8 +425,8 @@ define fastcc i32 @no_sibling_call_callee_more_stack_space(i32 %a, i32 %b) #1 {
 ; GCN-NEXT:    v_writelane_b32 v40, s31, 1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 2
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -469,11 +469,11 @@ define fastcc i32 @sibling_call_i32_fastcc_i32_i32_other_call(i32 %a, i32 %b, i3
 ; GCN-NEXT:    v_mov_b32_e32 v1, v40
 ; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT:    v_readlane_b32 s30, v42, 0
 ; GCN-NEXT:    s_getpc_b64 s[4:5]
 ; GCN-NEXT:    s_add_u32 s4, s4, sibling_call_i32_fastcc_i32_i32@rel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s5, s5, sibling_call_i32_fastcc_i32_i32@rel32@hi+12
 ; GCN-NEXT:    v_readlane_b32 s31, v42, 1
-; GCN-NEXT:    v_readlane_b32 s30, v42, 0
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s6, v42, 2
 ; GCN-NEXT:    s_or_saveexec_b64 s[8:9], -1
@@ -603,23 +603,23 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr
 ; FIJI-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; FIJI-NEXT:    s_mov_b64 exec, s[18:19]
 ; FIJI-NEXT:    v_writelane_b32 v40, s16, 18
-; FIJI-NEXT:    v_writelane_b32 v40, s30, 0
-; FIJI-NEXT:    v_writelane_b32 v40, s31, 1
-; FIJI-NEXT:    v_writelane_b32 v40, s34, 2
-; FIJI-NEXT:    v_writelane_b32 v40, s35, 3
-; FIJI-NEXT:    v_writelane_b32 v40, s36, 4
-; FIJI-NEXT:    v_writelane_b32 v40, s37, 5
-; FIJI-NEXT:    v_writelane_b32 v40, s38, 6
-; FIJI-NEXT:    v_writelane_b32 v40, s39, 7
-; FIJI-NEXT:    v_writelane_b32 v40, s48, 8
-; FIJI-NEXT:    v_writelane_b32 v40, s49, 9
-; FIJI-NEXT:    v_writelane_b32 v40, s50, 10
-; FIJI-NEXT:    v_writelane_b32 v40, s51, 11
-; FIJI-NEXT:    v_writelane_b32 v40, s52, 12
-; FIJI-NEXT:    v_writelane_b32 v40, s53, 13
-; FIJI-NEXT:    v_writelane_b32 v40, s54, 14
-; FIJI-NEXT:    v_writelane_b32 v40, s55, 15
-; FIJI-NEXT:    v_writelane_b32 v40, s64, 16
+; FIJI-NEXT:    v_writelane_b32 v40, s34, 0
+; FIJI-NEXT:    v_writelane_b32 v40, s35, 1
+; FIJI-NEXT:    v_writelane_b32 v40, s36, 2
+; FIJI-NEXT:    v_writelane_b32 v40, s37, 3
+; FIJI-NEXT:    v_writelane_b32 v40, s38, 4
+; FIJI-NEXT:    v_writelane_b32 v40, s39, 5
+; FIJI-NEXT:    v_writelane_b32 v40, s48, 6
+; FIJI-NEXT:    v_writelane_b32 v40, s49, 7
+; FIJI-NEXT:    v_writelane_b32 v40, s50, 8
+; FIJI-NEXT:    v_writelane_b32 v40, s51, 9
+; FIJI-NEXT:    v_writelane_b32 v40, s52, 10
+; FIJI-NEXT:    v_writelane_b32 v40, s53, 11
+; FIJI-NEXT:    v_writelane_b32 v40, s54, 12
+; FIJI-NEXT:    v_writelane_b32 v40, s55, 13
+; FIJI-NEXT:    v_writelane_b32 v40, s64, 14
+; FIJI-NEXT:    v_writelane_b32 v40, s65, 15
+; FIJI-NEXT:    v_writelane_b32 v40, s30, 16
 ; FIJI-NEXT:    s_mov_b32 s50, s15
 ; FIJI-NEXT:    s_mov_b32 s51, s14
 ; FIJI-NEXT:    s_mov_b32 s52, s13
@@ -631,7 +631,7 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr
 ; FIJI-NEXT:    v_add_u32_e32 v3, vcc, v3, v4
 ; FIJI-NEXT:    s_mov_b64 s[54:55], exec
 ; FIJI-NEXT:    s_addk_i32 s32, 0x400
-; FIJI-NEXT:    v_writelane_b32 v40, s65, 17
+; FIJI-NEXT:    v_writelane_b32 v40, s31, 17
 ; FIJI-NEXT:  .LBB18_1: ; =>This Inner Loop Header: Depth=1
 ; FIJI-NEXT:    v_readfirstlane_b32 s16, v0
 ; FIJI-NEXT:    v_readfirstlane_b32 s17, v1
@@ -657,25 +657,25 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr
 ; FIJI-NEXT:    s_cbranch_execnz .LBB18_1
 ; FIJI-NEXT:  ; %bb.2:
 ; FIJI-NEXT:    s_mov_b64 exec, s[54:55]
+; FIJI-NEXT:    v_readlane_b32 s30, v40, 16
 ; FIJI-NEXT:    v_mov_b32_e32 v0, v4
-; FIJI-NEXT:    v_readlane_b32 s65, v40, 17
-; FIJI-NEXT:    v_readlane_b32 s64, v40, 16
-; FIJI-NEXT:    v_readlane_b32 s55, v40, 15
-; FIJI-NEXT:    v_readlane_b32 s54, v40, 14
-; FIJI-NEXT:    v_readlane_b32 s53, v40, 13
-; FIJI-NEXT:    v_readlane_b32 s52, v40, 12
-; FIJI-NEXT:    v_readlane_b32 s51, v40, 11
-; FIJI-NEXT:    v_readlane_b32 s50, v40, 10
-; FIJI-NEXT:    v_readlane_b32 s49, v40, 9
-; FIJI-NEXT:    v_readlane_b32 s48, v40, 8
-; FIJI-NEXT:    v_readlane_b32 s39, v40, 7
-; FIJI-NEXT:    v_readlane_b32 s38, v40, 6
-; FIJI-NEXT:    v_readlane_b32 s37, v40, 5
-; FIJI-NEXT:    v_readlane_b32 s36, v40, 4
-; FIJI-NEXT:    v_readlane_b32 s35, v40, 3
-; FIJI-NEXT:    v_readlane_b32 s34, v40, 2
-; FIJI-NEXT:    v_readlane_b32 s31, v40, 1
-; FIJI-NEXT:    v_readlane_b32 s30, v40, 0
+; FIJI-NEXT:    v_readlane_b32 s31, v40, 17
+; FIJI-NEXT:    v_readlane_b32 s65, v40, 15
+; FIJI-NEXT:    v_readlane_b32 s64, v40, 14
+; FIJI-NEXT:    v_readlane_b32 s55, v40, 13
+; FIJI-NEXT:    v_readlane_b32 s54, v40, 12
+; FIJI-NEXT:    v_readlane_b32 s53, v40, 11
+; FIJI-NEXT:    v_readlane_b32 s52, v40, 10
+; FIJI-NEXT:    v_readlane_b32 s51, v40, 9
+; FIJI-NEXT:    v_readlane_b32 s50, v40, 8
+; FIJI-NEXT:    v_readlane_b32 s49, v40, 7
+; FIJI-NEXT:    v_readlane_b32 s48, v40, 6
+; FIJI-NEXT:    v_readlane_b32 s39, v40, 5
+; FIJI-NEXT:    v_readlane_b32 s38, v40, 4
+; FIJI-NEXT:    v_readlane_b32 s37, v40, 3
+; FIJI-NEXT:    v_readlane_b32 s36, v40, 2
+; FIJI-NEXT:    v_readlane_b32 s35, v40, 1
+; FIJI-NEXT:    v_readlane_b32 s34, v40, 0
 ; FIJI-NEXT:    s_mov_b32 s32, s33
 ; FIJI-NEXT:    v_readlane_b32 s4, v40, 18
 ; FIJI-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -694,23 +694,23 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr
 ; HAWAII-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; HAWAII-NEXT:    s_mov_b64 exec, s[18:19]
 ; HAWAII-NEXT:    v_writelane_b32 v40, s16, 18
-; HAWAII-NEXT:    v_writelane_b32 v40, s30, 0
-; HAWAII-NEXT:    v_writelane_b32 v40, s31, 1
-; HAWAII-NEXT:    v_writelane_b32 v40, s34, 2
-; HAWAII-NEXT:    v_writelane_b32 v40, s35, 3
-; HAWAII-NEXT:    v_writelane_b32 v40, s36, 4
-; HAWAII-NEXT:    v_writelane_b32 v40, s37, 5
-; HAWAII-NEXT:    v_writelane_b32 v40, s38, 6
-; HAWAII-NEXT:    v_writelane_b32 v40, s39, 7
-; HAWAII-NEXT:    v_writelane_b32 v40, s48, 8
-; HAWAII-NEXT:    v_writelane_b32 v40, s49, 9
-; HAWAII-NEXT:    v_writelane_b32 v40, s50, 10
-; HAWAII-NEXT:    v_writelane_b32 v40, s51, 11
-; HAWAII-NEXT:    v_writelane_b32 v40, s52, 12
-; HAWAII-NEXT:    v_writelane_b32 v40, s53, 13
-; HAWAII-NEXT:    v_writelane_b32 v40, s54, 14
-; HAWAII-NEXT:    v_writelane_b32 v40, s55, 15
-; HAWAII-NEXT:    v_writelane_b32 v40, s64, 16
+; HAWAII-NEXT:    v_writelane_b32 v40, s34, 0
+; HAWAII-NEXT:    v_writelane_b32 v40, s35, 1
+; HAWAII-NEXT:    v_writelane_b32 v40, s36, 2
+; HAWAII-NEXT:    v_writelane_b32 v40, s37, 3
+; HAWAII-NEXT:    v_writelane_b32 v40, s38, 4
+; HAWAII-NEXT:    v_writelane_b32 v40, s39, 5
+; HAWAII-NEXT:    v_writelane_b32 v40, s48, 6
+; HAWAII-NEXT:    v_writelane_b32 v40, s49, 7
+; HAWAII-NEXT:    v_writelane_b32 v40, s50, 8
+; HAWAII-NEXT:    v_writelane_b32 v40, s51, 9
+; HAWAII-NEXT:    v_writelane_b32 v40, s52, 10
+; HAWAII-NEXT:    v_writelane_b32 v40, s53, 11
+; HAWAII-NEXT:    v_writelane_b32 v40, s54, 12
+; HAWAII-NEXT:    v_writelane_b32 v40, s55, 13
+; HAWAII-NEXT:    v_writelane_b32 v40, s64, 14
+; HAWAII-NEXT:    v_writelane_b32 v40, s65, 15
+; HAWAII-NEXT:    v_writelane_b32 v40, s30, 16
 ; HAWAII-NEXT:    s_mov_b32 s50, s15
 ; HAWAII-NEXT:    s_mov_b32 s51, s14
 ; HAWAII-NEXT:    s_mov_b32 s52, s13
@@ -722,7 +722,7 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr
 ; HAWAII-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
 ; HAWAII-NEXT:    s_mov_b64 s[54:55], exec
 ; HAWAII-NEXT:    s_addk_i32 s32, 0x400
-; HAWAII-NEXT:    v_writelane_b32 v40, s65, 17
+; HAWAII-NEXT:    v_writelane_b32 v40, s31, 17
 ; HAWAII-NEXT:  .LBB18_1: ; =>This Inner Loop Header: Depth=1
 ; HAWAII-NEXT:    v_readfirstlane_b32 s16, v0
 ; HAWAII-NEXT:    v_readfirstlane_b32 s17, v1
@@ -748,25 +748,25 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr
 ; HAWAII-NEXT:    s_cbranch_execnz .LBB18_1
 ; HAWAII-NEXT:  ; %bb.2:
 ; HAWAII-NEXT:    s_mov_b64 exec, s[54:55]
+; HAWAII-NEXT:    v_readlane_b32 s30, v40, 16
 ; HAWAII-NEXT:    v_mov_b32_e32 v0, v4
-; HAWAII-NEXT:    v_readlane_b32 s65, v40, 17
-; HAWAII-NEXT:    v_readlane_b32 s64, v40, 16
-; HAWAII-NEXT:    v_readlane_b32 s55, v40, 15
-; HAWAII-NEXT:    v_readlane_b32 s54, v40, 14
-; HAWAII-NEXT:    v_readlane_b32 s53, v40, 13
-; HAWAII-NEXT:    v_readlane_b32 s52, v40, 12
-; HAWAII-NEXT:    v_readlane_b32 s51, v40, 11
-; HAWAII-NEXT:    v_readlane_b32 s50, v40, 10
-; HAWAII-NEXT:    v_readlane_b32 s49, v40, 9
-; HAWAII-NEXT:    v_readlane_b32 s48, v40, 8
-; HAWAII-NEXT:    v_readlane_b32 s39, v40, 7
-; HAWAII-NEXT:    v_readlane_b32 s38, v40, 6
-; HAWAII-NEXT:    v_readlane_b32 s37, v40, 5
-; HAWAII-NEXT:    v_readlane_b32 s36, v40, 4
-; HAWAII-NEXT:    v_readlane_b32 s35, v40, 3
-; HAWAII-NEXT:    v_readlane_b32 s34, v40, 2
-; HAWAII-NEXT:    v_readlane_b32 s31, v40, 1
-; HAWAII-NEXT:    v_readlane_b32 s30, v40, 0
+; HAWAII-NEXT:    v_readlane_b32 s31, v40, 17
+; HAWAII-NEXT:    v_readlane_b32 s65, v40, 15
+; HAWAII-NEXT:    v_readlane_b32 s64, v40, 14
+; HAWAII-NEXT:    v_readlane_b32 s55, v40, 13
+; HAWAII-NEXT:    v_readlane_b32 s54, v40, 12
+; HAWAII-NEXT:    v_readlane_b32 s53, v40, 11
+; HAWAII-NEXT:    v_readlane_b32 s52, v40, 10
+; HAWAII-NEXT:    v_readlane_b32 s51, v40, 9
+; HAWAII-NEXT:    v_readlane_b32 s50, v40, 8
+; HAWAII-NEXT:    v_readlane_b32 s49, v40, 7
+; HAWAII-NEXT:    v_readlane_b32 s48, v40, 6
+; HAWAII-NEXT:    v_readlane_b32 s39, v40, 5
+; HAWAII-NEXT:    v_readlane_b32 s38, v40, 4
+; HAWAII-NEXT:    v_readlane_b32 s37, v40, 3
+; HAWAII-NEXT:    v_readlane_b32 s36, v40, 2
+; HAWAII-NEXT:    v_readlane_b32 s35, v40, 1
+; HAWAII-NEXT:    v_readlane_b32 s34, v40, 0
 ; HAWAII-NEXT:    s_mov_b32 s32, s33
 ; HAWAII-NEXT:    v_readlane_b32 s4, v40, 18
 ; HAWAII-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -785,23 +785,23 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[18:19]
 ; GFX9-NEXT:    v_writelane_b32 v40, s16, 18
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s35, 3
-; GFX9-NEXT:    v_writelane_b32 v40, s36, 4
-; GFX9-NEXT:    v_writelane_b32 v40, s37, 5
-; GFX9-NEXT:    v_writelane_b32 v40, s38, 6
-; GFX9-NEXT:    v_writelane_b32 v40, s39, 7
-; GFX9-NEXT:    v_writelane_b32 v40, s48, 8
-; GFX9-NEXT:    v_writelane_b32 v40, s49, 9
-; GFX9-NEXT:    v_writelane_b32 v40, s50, 10
-; GFX9-NEXT:    v_writelane_b32 v40, s51, 11
-; GFX9-NEXT:    v_writelane_b32 v40, s52, 12
-; GFX9-NEXT:    v_writelane_b32 v40, s53, 13
-; GFX9-NEXT:    v_writelane_b32 v40, s54, 14
-; GFX9-NEXT:    v_writelane_b32 v40, s55, 15
-; GFX9-NEXT:    v_writelane_b32 v40, s64, 16
+; GFX9-NEXT:    v_writelane_b32 v40, s34, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s35, 1
+; GFX9-NEXT:    v_writelane_b32 v40, s36, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s37, 3
+; GFX9-NEXT:    v_writelane_b32 v40, s38, 4
+; GFX9-NEXT:    v_writelane_b32 v40, s39, 5
+; GFX9-NEXT:    v_writelane_b32 v40, s48, 6
+; GFX9-NEXT:    v_writelane_b32 v40, s49, 7
+; GFX9-NEXT:    v_writelane_b32 v40, s50, 8
+; GFX9-NEXT:    v_writelane_b32 v40, s51, 9
+; GFX9-NEXT:    v_writelane_b32 v40, s52, 10
+; GFX9-NEXT:    v_writelane_b32 v40, s53, 11
+; GFX9-NEXT:    v_writelane_b32 v40, s54, 12
+; GFX9-NEXT:    v_writelane_b32 v40, s55, 13
+; GFX9-NEXT:    v_writelane_b32 v40, s64, 14
+; GFX9-NEXT:    v_writelane_b32 v40, s65, 15
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 16
 ; GFX9-NEXT:    s_mov_b32 s50, s15
 ; GFX9-NEXT:    s_mov_b32 s51, s14
 ; GFX9-NEXT:    s_mov_b32 s52, s13
@@ -813,7 +813,7 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr
 ; GFX9-NEXT:    v_add_u32_e32 v3, v3, v4
 ; GFX9-NEXT:    s_mov_b64 s[54:55], exec
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s65, 17
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 17
 ; GFX9-NEXT:  .LBB18_1: ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    v_readfirstlane_b32 s16, v0
 ; GFX9-NEXT:    v_readfirstlane_b32 s17, v1
@@ -839,25 +839,25 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr
 ; GFX9-NEXT:    s_cbranch_execnz .LBB18_1
 ; GFX9-NEXT:  ; %bb.2:
 ; GFX9-NEXT:    s_mov_b64 exec, s[54:55]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 16
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v4
-; GFX9-NEXT:    v_readlane_b32 s65, v40, 17
-; GFX9-NEXT:    v_readlane_b32 s64, v40, 16
-; GFX9-NEXT:    v_readlane_b32 s55, v40, 15
-; GFX9-NEXT:    v_readlane_b32 s54, v40, 14
-; GFX9-NEXT:    v_readlane_b32 s53, v40, 13
-; GFX9-NEXT:    v_readlane_b32 s52, v40, 12
-; GFX9-NEXT:    v_readlane_b32 s51, v40, 11
-; GFX9-NEXT:    v_readlane_b32 s50, v40, 10
-; GFX9-NEXT:    v_readlane_b32 s49, v40, 9
-; GFX9-NEXT:    v_readlane_b32 s48, v40, 8
-; GFX9-NEXT:    v_readlane_b32 s39, v40, 7
-; GFX9-NEXT:    v_readlane_b32 s38, v40, 6
-; GFX9-NEXT:    v_readlane_b32 s37, v40, 5
-; GFX9-NEXT:    v_readlane_b32 s36, v40, 4
-; GFX9-NEXT:    v_readlane_b32 s35, v40, 3
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 17
+; GFX9-NEXT:    v_readlane_b32 s65, v40, 15
+; GFX9-NEXT:    v_readlane_b32 s64, v40, 14
+; GFX9-NEXT:    v_readlane_b32 s55, v40, 13
+; GFX9-NEXT:    v_readlane_b32 s54, v40, 12
+; GFX9-NEXT:    v_readlane_b32 s53, v40, 11
+; GFX9-NEXT:    v_readlane_b32 s52, v40, 10
+; GFX9-NEXT:    v_readlane_b32 s51, v40, 9
+; GFX9-NEXT:    v_readlane_b32 s50, v40, 8
+; GFX9-NEXT:    v_readlane_b32 s49, v40, 7
+; GFX9-NEXT:    v_readlane_b32 s48, v40, 6
+; GFX9-NEXT:    v_readlane_b32 s39, v40, 5
+; GFX9-NEXT:    v_readlane_b32 s38, v40, 4
+; GFX9-NEXT:    v_readlane_b32 s37, v40, 3
+; GFX9-NEXT:    v_readlane_b32 s36, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s35, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s34, v40, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 18
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign.ll b/llvm/test/CodeGen/AMDGPU/stack-realign.ll
index 5ef54268c9372..540737672ed15 100644
--- a/llvm/test/CodeGen/AMDGPU/stack-realign.ll
+++ b/llvm/test/CodeGen/AMDGPU/stack-realign.ll
@@ -311,8 +311,8 @@ define void @func_call_align1024_bp_gets_vgpr_spill(<32 x i32> %a, i32 %b) #0 {
 ; GCN-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:4
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    s_mov_b32 s32, s34
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 2
 ; GCN-NEXT:    v_readlane_b32 s34, v40, 3
diff --git a/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll b/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll
index a4aa8d55106e8..ea71f5ac7112c 100644
--- a/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll
+++ b/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll
@@ -1327,11 +1327,11 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() {
 ; WAVE32-OPT-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
 ; WAVE32-OPT-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; WAVE32-OPT-NEXT:    s_mov_b32 s32, s18
+; WAVE32-OPT-NEXT:    v_readlane_b32 s30, v32, 0
 ; WAVE32-OPT-NEXT:    ;;#ASMSTART
 ; WAVE32-OPT-NEXT:    ; use s19
 ; WAVE32-OPT-NEXT:    ;;#ASMEND
 ; WAVE32-OPT-NEXT:    v_readlane_b32 s31, v32, 1
-; WAVE32-OPT-NEXT:    v_readlane_b32 s30, v32, 0
 ; WAVE32-OPT-NEXT:    s_mov_b32 s32, s33
 ; WAVE32-OPT-NEXT:    s_xor_saveexec_b32 s4, -1
 ; WAVE32-OPT-NEXT:    buffer_load_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload
@@ -1362,11 +1362,11 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() {
 ; WAVE64-OPT-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
 ; WAVE64-OPT-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; WAVE64-OPT-NEXT:    s_mov_b32 s32, s18
+; WAVE64-OPT-NEXT:    v_readlane_b32 s30, v32, 0
 ; WAVE64-OPT-NEXT:    ;;#ASMSTART
 ; WAVE64-OPT-NEXT:    ; use s19
 ; WAVE64-OPT-NEXT:    ;;#ASMEND
 ; WAVE64-OPT-NEXT:    v_readlane_b32 s31, v32, 1
-; WAVE64-OPT-NEXT:    v_readlane_b32 s30, v32, 0
 ; WAVE64-OPT-NEXT:    s_mov_b32 s32, s33
 ; WAVE64-OPT-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; WAVE64-OPT-NEXT:    buffer_load_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload
@@ -1478,8 +1478,8 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() {
 ; WAVE32-O0-NEXT:    ; use s5
 ; WAVE32-O0-NEXT:    ;;#ASMEND
 ; WAVE32-O0-NEXT:    s_mov_b32 s32, s4
-; WAVE32-O0-NEXT:    v_readlane_b32 s31, v32, 1
 ; WAVE32-O0-NEXT:    v_readlane_b32 s30, v32, 0
+; WAVE32-O0-NEXT:    v_readlane_b32 s31, v32, 1
 ; WAVE32-O0-NEXT:    s_mov_b32 s32, s33
 ; WAVE32-O0-NEXT:    s_xor_saveexec_b32 s4, -1
 ; WAVE32-O0-NEXT:    buffer_load_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload
@@ -1592,8 +1592,8 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() {
 ; WAVE64-O0-NEXT:    ; use s5
 ; WAVE64-O0-NEXT:    ;;#ASMEND
 ; WAVE64-O0-NEXT:    s_mov_b32 s32, s4
-; WAVE64-O0-NEXT:    v_readlane_b32 s31, v32, 1
 ; WAVE64-O0-NEXT:    v_readlane_b32 s30, v32, 0
+; WAVE64-O0-NEXT:    v_readlane_b32 s31, v32, 1
 ; WAVE64-O0-NEXT:    s_mov_b32 s32, s33
 ; WAVE64-O0-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; WAVE64-O0-NEXT:    buffer_load_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload
@@ -1706,8 +1706,8 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() {
 ; WAVE32-WWM-PREALLOC-NEXT:    ; use s5
 ; WAVE32-WWM-PREALLOC-NEXT:    ;;#ASMEND
 ; WAVE32-WWM-PREALLOC-NEXT:    s_mov_b32 s32, s4
-; WAVE32-WWM-PREALLOC-NEXT:    v_readlane_b32 s31, v33, 1
 ; WAVE32-WWM-PREALLOC-NEXT:    v_readlane_b32 s30, v33, 0
+; WAVE32-WWM-PREALLOC-NEXT:    v_readlane_b32 s31, v33, 1
 ; WAVE32-WWM-PREALLOC-NEXT:    s_mov_b32 s32, s33
 ; WAVE32-WWM-PREALLOC-NEXT:    s_xor_saveexec_b32 s4, -1
 ; WAVE32-WWM-PREALLOC-NEXT:    buffer_load_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll b/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll
index 034119b98790f..05ea168c9ec7c 100644
--- a/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll
+++ b/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll
@@ -175,8 +175,8 @@ define void @outgoing_f16_arg(ptr %ptr) #0 {
 ; GFX7-NEXT:    s_addk_i32 s32, 0x400
 ; GFX7-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX7-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX7-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX7-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX7-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX7-NEXT:    s_mov_b32 s32, s33
 ; GFX7-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX7-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -207,8 +207,8 @@ define void @outgoing_v2f16_arg(ptr %ptr) #0 {
 ; GFX7-NEXT:    s_addk_i32 s32, 0x400
 ; GFX7-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX7-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX7-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX7-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX7-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX7-NEXT:    s_mov_b32 s32, s33
 ; GFX7-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX7-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -245,8 +245,8 @@ define void @outgoing_f16_return(ptr %ptr) #0 {
 ; GFX7-NEXT:    flat_store_short v[40:41], v0
 ; GFX7-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX7-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX7-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX7-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX7-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX7-NEXT:    s_mov_b32 s32, s33
 ; GFX7-NEXT:    v_readlane_b32 s4, v42, 2
 ; GFX7-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -283,8 +283,8 @@ define void @outgoing_v2f16_return(ptr %ptr) #0 {
 ; GFX7-NEXT:    flat_store_dword v[40:41], v0
 ; GFX7-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX7-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX7-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX7-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX7-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX7-NEXT:    s_mov_b32 s32, s33
 ; GFX7-NEXT:    v_readlane_b32 s4, v42, 2
 ; GFX7-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -324,8 +324,8 @@ define void @outgoing_v4f16_return(ptr %ptr) #0 {
 ; GFX7-NEXT:    flat_store_dword v[40:41], v0
 ; GFX7-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX7-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX7-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX7-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX7-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX7-NEXT:    s_mov_b32 s32, s33
 ; GFX7-NEXT:    v_readlane_b32 s4, v42, 2
 ; GFX7-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -371,8 +371,8 @@ define void @outgoing_v8f16_return(ptr %ptr) #0 {
 ; GFX7-NEXT:    flat_store_dword v[40:41], v0
 ; GFX7-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX7-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX7-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX7-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX7-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX7-NEXT:    s_mov_b32 s32, s33
 ; GFX7-NEXT:    v_readlane_b32 s4, v42, 2
 ; GFX7-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -402,8 +402,8 @@ define half @call_split_type_used_outside_block_v8f16() #0 {
 ; GFX7-NEXT:    s_addk_i32 s32, 0x400
 ; GFX7-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX7-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX7-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX7-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX7-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX7-NEXT:    s_mov_b32 s32, s33
 ; GFX7-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX7-NEXT:    s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/swdev504645-global-fold.ll b/llvm/test/CodeGen/AMDGPU/swdev504645-global-fold.ll
index 5c6fcd4f977e3..13cde61ff16a0 100644
--- a/llvm/test/CodeGen/AMDGPU/swdev504645-global-fold.ll
+++ b/llvm/test/CodeGen/AMDGPU/swdev504645-global-fold.ll
@@ -18,11 +18,12 @@ define void @test_load_zext() #0 {
 ; CHECK-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; CHECK-NEXT:    v_writelane_b32 v40, s30, 0
 ; CHECK-NEXT:    s_mov_b32 s0, DescriptorBuffer@abs32@lo
+; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    v_writelane_b32 v40, s31, 1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[2:3]
-; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
 ; CHECK-NEXT:    v_readlane_b32 s30, v40, 0
+; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
 ; CHECK-NEXT:    s_mov_b32 s32, s33
 ; CHECK-NEXT:    v_readlane_b32 s0, v40, 2
 ; CHECK-NEXT:    s_or_saveexec_b64 s[2:3], -1
diff --git a/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.error.ll b/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.error.ll
index ab7011b0dc334..cb756e9a8b1ca 100644
--- a/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.error.ll
+++ b/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.error.ll
@@ -26,8 +26,8 @@ define void @tail_call_i32_inreg_divergent(i32 %vgpr) #0 {
 ; CHECK-NEXT:    s_addc_u32 s19, s19, void_func_i32_inreg@rel32@hi+12
 ; CHECK-NEXT:     ; illegal copy v0 to s16
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[18:19]
-; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
 ; CHECK-NEXT:    v_readlane_b32 s30, v40, 0
+; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
 ; CHECK-NEXT:    s_mov_b32 s32, s33
 ; CHECK-NEXT:    v_readlane_b32 s4, v40, 2
 ; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -62,8 +62,8 @@ define void @indirect_tail_call_i32_inreg_divergent(i32 %vgpr) #0 {
 ; CHECK-NEXT:     ; illegal copy v0 to s16
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[18:19]
-; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
 ; CHECK-NEXT:    v_readlane_b32 s30, v40, 0
+; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
 ; CHECK-NEXT:    s_mov_b32 s32, s33
 ; CHECK-NEXT:    v_readlane_b32 s4, v40, 2
 ; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll b/llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll
index e78d62561238b..e5215fe1acdef 100644
--- a/llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll
+++ b/llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll
@@ -646,29 +646,30 @@ define i32 @s_in_multiuse_A(i32 inreg %x, i32 inreg %y, i32 inreg %z, i32 inreg
 ; GCN-NEXT:    s_or_saveexec_b32 s16, -1
 ; GCN-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b32 exec_lo, s16
-; GCN-NEXT:    v_writelane_b32 v40, s2, 4
 ; GCN-NEXT:    s_add_i32 s32, s32, 16
 ; GCN-NEXT:    s_getpc_b64 s[16:17]
 ; GCN-NEXT:    s_add_u32 s16, s16, use32@gotpcrel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s17, s17, use32@gotpcrel32@hi+12
-; GCN-NEXT:    s_xor_b32 s0, s0, s1
+; GCN-NEXT:    v_writelane_b32 v40, s2, 4
 ; GCN-NEXT:    s_load_b64 s[16:17], s[16:17], 0x0
-; GCN-NEXT:    v_writelane_b32 v40, s30, 0
-; GCN-NEXT:    v_writelane_b32 v40, s31, 1
-; GCN-NEXT:    v_writelane_b32 v40, s34, 2
+; GCN-NEXT:    s_xor_b32 s0, s0, s1
+; GCN-NEXT:    v_writelane_b32 v40, s34, 0
 ; GCN-NEXT:    s_mov_b32 s34, s1
-; GCN-NEXT:    v_writelane_b32 v40, s35, 3
+; GCN-NEXT:    v_writelane_b32 v40, s35, 1
 ; GCN-NEXT:    s_and_b32 s35, s0, s3
 ; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s35
+; GCN-NEXT:    v_writelane_b32 v40, s30, 2
+; GCN-NEXT:    v_writelane_b32 v40, s31, 3
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GCN-NEXT:    s_xor_b32 s0, s35, s34
-; GCN-NEXT:    v_readlane_b32 s35, v40, 3
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT:    v_readlane_b32 s30, v40, 2
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-NEXT:    v_readlane_b32 s34, v40, 2
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
-; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s31, v40, 3
+; GCN-NEXT:    v_readlane_b32 s35, v40, 1
+; GCN-NEXT:    v_readlane_b32 s34, v40, 0
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s0, v40, 4
 ; GCN-NEXT:    s_or_saveexec_b32 s1, -1
@@ -702,20 +703,21 @@ define i32 @s_in_multiuse_B(i32 inreg %x, i32 inreg %y, i32 inreg %z, i32 inreg
 ; GCN-NEXT:    s_xor_b32 s0, s0, s1
 ; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-NEXT:    v_writelane_b32 v40, s30, 0
-; GCN-NEXT:    v_writelane_b32 v40, s31, 1
-; GCN-NEXT:    v_writelane_b32 v40, s34, 2
+; GCN-NEXT:    v_writelane_b32 v40, s34, 0
 ; GCN-NEXT:    s_mov_b32 s34, s1
-; GCN-NEXT:    v_writelane_b32 v40, s35, 3
+; GCN-NEXT:    v_writelane_b32 v40, s35, 1
 ; GCN-NEXT:    s_and_b32 s35, s0, s3
+; GCN-NEXT:    v_writelane_b32 v40, s30, 2
+; GCN-NEXT:    v_writelane_b32 v40, s31, 3
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GCN-NEXT:    s_xor_b32 s0, s35, s34
-; GCN-NEXT:    v_readlane_b32 s35, v40, 3
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT:    v_readlane_b32 s30, v40, 2
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-NEXT:    v_readlane_b32 s34, v40, 2
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
-; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s31, v40, 3
+; GCN-NEXT:    v_readlane_b32 s35, v40, 1
+; GCN-NEXT:    v_readlane_b32 s34, v40, 0
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s0, v40, 4
 ; GCN-NEXT:    s_or_saveexec_b32 s1, -1
diff --git a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
index 26d8a047e6541..d04c1b970187e 100644
--- a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
+++ b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
@@ -14,22 +14,22 @@ define hidden void @widget() {
 ; GCN-NEXT:    v_writelane_b32 v41, s16, 16
 ; GCN-NEXT:    s_addk_i32 s32, 0x400
 ; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-NEXT:    v_writelane_b32 v41, s30, 0
-; GCN-NEXT:    v_writelane_b32 v41, s31, 1
-; GCN-NEXT:    v_writelane_b32 v41, s34, 2
-; GCN-NEXT:    v_writelane_b32 v41, s35, 3
-; GCN-NEXT:    v_writelane_b32 v41, s36, 4
-; GCN-NEXT:    v_writelane_b32 v41, s37, 5
-; GCN-NEXT:    v_writelane_b32 v41, s38, 6
-; GCN-NEXT:    v_writelane_b32 v41, s39, 7
-; GCN-NEXT:    v_writelane_b32 v41, s48, 8
-; GCN-NEXT:    v_writelane_b32 v41, s49, 9
-; GCN-NEXT:    v_writelane_b32 v41, s50, 10
-; GCN-NEXT:    v_writelane_b32 v41, s51, 11
-; GCN-NEXT:    v_writelane_b32 v41, s52, 12
-; GCN-NEXT:    v_writelane_b32 v41, s53, 13
-; GCN-NEXT:    v_writelane_b32 v41, s54, 14
-; GCN-NEXT:    v_writelane_b32 v41, s55, 15
+; GCN-NEXT:    v_writelane_b32 v41, s34, 0
+; GCN-NEXT:    v_writelane_b32 v41, s35, 1
+; GCN-NEXT:    v_writelane_b32 v41, s36, 2
+; GCN-NEXT:    v_writelane_b32 v41, s37, 3
+; GCN-NEXT:    v_writelane_b32 v41, s38, 4
+; GCN-NEXT:    v_writelane_b32 v41, s39, 5
+; GCN-NEXT:    v_writelane_b32 v41, s48, 6
+; GCN-NEXT:    v_writelane_b32 v41, s49, 7
+; GCN-NEXT:    v_writelane_b32 v41, s50, 8
+; GCN-NEXT:    v_writelane_b32 v41, s51, 9
+; GCN-NEXT:    v_writelane_b32 v41, s52, 10
+; GCN-NEXT:    v_writelane_b32 v41, s53, 11
+; GCN-NEXT:    v_writelane_b32 v41, s54, 12
+; GCN-NEXT:    v_writelane_b32 v41, s55, 13
+; GCN-NEXT:    v_writelane_b32 v41, s30, 14
+; GCN-NEXT:    v_writelane_b32 v41, s31, 15
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    flat_load_dword v0, v[0:1]
@@ -93,22 +93,22 @@ define hidden void @widget() {
 ; GCN-NEXT:    s_addc_u32 s17, s17, wibble@rel32@hi+12
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GCN-NEXT:  .LBB0_8: ; %UnifiedReturnBlock
-; GCN-NEXT:    v_readlane_b32 s55, v41, 15
-; GCN-NEXT:    v_readlane_b32 s54, v41, 14
-; GCN-NEXT:    v_readlane_b32 s53, v41, 13
-; GCN-NEXT:    v_readlane_b32 s52, v41, 12
-; GCN-NEXT:    v_readlane_b32 s51, v41, 11
-; GCN-NEXT:    v_readlane_b32 s50, v41, 10
-; GCN-NEXT:    v_readlane_b32 s49, v41, 9
-; GCN-NEXT:    v_readlane_b32 s48, v41, 8
-; GCN-NEXT:    v_readlane_b32 s39, v41, 7
-; GCN-NEXT:    v_readlane_b32 s38, v41, 6
-; GCN-NEXT:    v_readlane_b32 s37, v41, 5
-; GCN-NEXT:    v_readlane_b32 s36, v41, 4
-; GCN-NEXT:    v_readlane_b32 s35, v41, 3
-; GCN-NEXT:    v_readlane_b32 s34, v41, 2
-; GCN-NEXT:    v_readlane_b32 s31, v41, 1
-; GCN-NEXT:    v_readlane_b32 s30, v41, 0
+; GCN-NEXT:    v_readlane_b32 s30, v41, 14
+; GCN-NEXT:    v_readlane_b32 s31, v41, 15
+; GCN-NEXT:    v_readlane_b32 s55, v41, 13
+; GCN-NEXT:    v_readlane_b32 s54, v41, 12
+; GCN-NEXT:    v_readlane_b32 s53, v41, 11
+; GCN-NEXT:    v_readlane_b32 s52, v41, 10
+; GCN-NEXT:    v_readlane_b32 s51, v41, 9
+; GCN-NEXT:    v_readlane_b32 s50, v41, 8
+; GCN-NEXT:    v_readlane_b32 s49, v41, 7
+; GCN-NEXT:    v_readlane_b32 s48, v41, 6
+; GCN-NEXT:    v_readlane_b32 s39, v41, 5
+; GCN-NEXT:    v_readlane_b32 s38, v41, 4
+; GCN-NEXT:    v_readlane_b32 s37, v41, 3
+; GCN-NEXT:    v_readlane_b32 s36, v41, 2
+; GCN-NEXT:    v_readlane_b32 s35, v41, 1
+; GCN-NEXT:    v_readlane_b32 s34, v41, 0
 ; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v41, 16
@@ -266,32 +266,32 @@ define hidden void @blam() {
 ; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-NEXT:    v_writelane_b32 v45, s30, 0
-; GCN-NEXT:    v_writelane_b32 v45, s31, 1
-; GCN-NEXT:    v_writelane_b32 v45, s34, 2
-; GCN-NEXT:    v_writelane_b32 v45, s35, 3
-; GCN-NEXT:    v_writelane_b32 v45, s36, 4
-; GCN-NEXT:    v_writelane_b32 v45, s37, 5
-; GCN-NEXT:    v_writelane_b32 v45, s38, 6
-; GCN-NEXT:    v_writelane_b32 v45, s39, 7
-; GCN-NEXT:    v_writelane_b32 v45, s48, 8
-; GCN-NEXT:    v_writelane_b32 v45, s49, 9
-; GCN-NEXT:    v_writelane_b32 v45, s50, 10
-; GCN-NEXT:    v_writelane_b32 v45, s51, 11
-; GCN-NEXT:    v_writelane_b32 v45, s52, 12
-; GCN-NEXT:    v_writelane_b32 v45, s53, 13
-; GCN-NEXT:    v_writelane_b32 v45, s54, 14
-; GCN-NEXT:    v_writelane_b32 v45, s55, 15
-; GCN-NEXT:    v_writelane_b32 v45, s64, 16
-; GCN-NEXT:    v_writelane_b32 v45, s65, 17
-; GCN-NEXT:    v_writelane_b32 v45, s66, 18
-; GCN-NEXT:    v_writelane_b32 v45, s67, 19
-; GCN-NEXT:    v_writelane_b32 v45, s68, 20
-; GCN-NEXT:    v_writelane_b32 v45, s69, 21
-; GCN-NEXT:    v_writelane_b32 v45, s70, 22
-; GCN-NEXT:    v_writelane_b32 v45, s71, 23
-; GCN-NEXT:    v_writelane_b32 v45, s80, 24
-; GCN-NEXT:    v_writelane_b32 v45, s81, 25
+; GCN-NEXT:    v_writelane_b32 v45, s34, 0
+; GCN-NEXT:    v_writelane_b32 v45, s35, 1
+; GCN-NEXT:    v_writelane_b32 v45, s36, 2
+; GCN-NEXT:    v_writelane_b32 v45, s37, 3
+; GCN-NEXT:    v_writelane_b32 v45, s38, 4
+; GCN-NEXT:    v_writelane_b32 v45, s39, 5
+; GCN-NEXT:    v_writelane_b32 v45, s48, 6
+; GCN-NEXT:    v_writelane_b32 v45, s49, 7
+; GCN-NEXT:    v_writelane_b32 v45, s50, 8
+; GCN-NEXT:    v_writelane_b32 v45, s51, 9
+; GCN-NEXT:    v_writelane_b32 v45, s52, 10
+; GCN-NEXT:    v_writelane_b32 v45, s53, 11
+; GCN-NEXT:    v_writelane_b32 v45, s54, 12
+; GCN-NEXT:    v_writelane_b32 v45, s55, 13
+; GCN-NEXT:    v_writelane_b32 v45, s64, 14
+; GCN-NEXT:    v_writelane_b32 v45, s65, 15
+; GCN-NEXT:    v_writelane_b32 v45, s66, 16
+; GCN-NEXT:    v_writelane_b32 v45, s67, 17
+; GCN-NEXT:    v_writelane_b32 v45, s68, 18
+; GCN-NEXT:    v_writelane_b32 v45, s69, 19
+; GCN-NEXT:    v_writelane_b32 v45, s70, 20
+; GCN-NEXT:    v_writelane_b32 v45, s71, 21
+; GCN-NEXT:    v_writelane_b32 v45, s80, 22
+; GCN-NEXT:    v_writelane_b32 v45, s81, 23
+; GCN-NEXT:    v_writelane_b32 v45, s30, 24
+; GCN-NEXT:    v_writelane_b32 v45, s31, 25
 ; GCN-NEXT:    v_mov_b32_e32 v40, v31
 ; GCN-NEXT:    s_mov_b32 s54, s15
 ; GCN-NEXT:    s_mov_b32 s55, s14
@@ -427,32 +427,32 @@ define hidden void @blam() {
 ; GCN-NEXT:    s_branch .LBB1_1
 ; GCN-NEXT:  .LBB1_18: ; %DummyReturnBlock
 ; GCN-NEXT:    s_or_b64 exec, exec, s[66:67]
-; GCN-NEXT:    v_readlane_b32 s81, v45, 25
-; GCN-NEXT:    v_readlane_b32 s80, v45, 24
-; GCN-NEXT:    v_readlane_b32 s71, v45, 23
-; GCN-NEXT:    v_readlane_b32 s70, v45, 22
-; GCN-NEXT:    v_readlane_b32 s69, v45, 21
-; GCN-NEXT:    v_readlane_b32 s68, v45, 20
-; GCN-NEXT:    v_readlane_b32 s67, v45, 19
-; GCN-NEXT:    v_readlane_b32 s66, v45, 18
-; GCN-NEXT:    v_readlane_b32 s65, v45, 17
-; GCN-NEXT:    v_readlane_b32 s64, v45, 16
-; GCN-NEXT:    v_readlane_b32 s55, v45, 15
-; GCN-NEXT:    v_readlane_b32 s54, v45, 14
-; GCN-NEXT:    v_readlane_b32 s53, v45, 13
-; GCN-NEXT:    v_readlane_b32 s52, v45, 12
-; GCN-NEXT:    v_readlane_b32 s51, v45, 11
-; GCN-NEXT:    v_readlane_b32 s50, v45, 10
-; GCN-NEXT:    v_readlane_b32 s49, v45, 9
-; GCN-NEXT:    v_readlane_b32 s48, v45, 8
-; GCN-NEXT:    v_readlane_b32 s39, v45, 7
-; GCN-NEXT:    v_readlane_b32 s38, v45, 6
-; GCN-NEXT:    v_readlane_b32 s37, v45, 5
-; GCN-NEXT:    v_readlane_b32 s36, v45, 4
-; GCN-NEXT:    v_readlane_b32 s35, v45, 3
-; GCN-NEXT:    v_readlane_b32 s34, v45, 2
-; GCN-NEXT:    v_readlane_b32 s31, v45, 1
-; GCN-NEXT:    v_readlane_b32 s30, v45, 0
+; GCN-NEXT:    v_readlane_b32 s30, v45, 24
+; GCN-NEXT:    v_readlane_b32 s31, v45, 25
+; GCN-NEXT:    v_readlane_b32 s81, v45, 23
+; GCN-NEXT:    v_readlane_b32 s80, v45, 22
+; GCN-NEXT:    v_readlane_b32 s71, v45, 21
+; GCN-NEXT:    v_readlane_b32 s70, v45, 20
+; GCN-NEXT:    v_readlane_b32 s69, v45, 19
+; GCN-NEXT:    v_readlane_b32 s68, v45, 18
+; GCN-NEXT:    v_readlane_b32 s67, v45, 17
+; GCN-NEXT:    v_readlane_b32 s66, v45, 16
+; GCN-NEXT:    v_readlane_b32 s65, v45, 15
+; GCN-NEXT:    v_readlane_b32 s64, v45, 14
+; GCN-NEXT:    v_readlane_b32 s55, v45, 13
+; GCN-NEXT:    v_readlane_b32 s54, v45, 12
+; GCN-NEXT:    v_readlane_b32 s53, v45, 11
+; GCN-NEXT:    v_readlane_b32 s52, v45, 10
+; GCN-NEXT:    v_readlane_b32 s51, v45, 9
+; GCN-NEXT:    v_readlane_b32 s50, v45, 8
+; GCN-NEXT:    v_readlane_b32 s49, v45, 7
+; GCN-NEXT:    v_readlane_b32 s48, v45, 6
+; GCN-NEXT:    v_readlane_b32 s39, v45, 5
+; GCN-NEXT:    v_readlane_b32 s38, v45, 4
+; GCN-NEXT:    v_readlane_b32 s37, v45, 3
+; GCN-NEXT:    v_readlane_b32 s36, v45, 2
+; GCN-NEXT:    v_readlane_b32 s35, v45, 1
+; GCN-NEXT:    v_readlane_b32 s34, v45, 0
 ; GCN-NEXT:    buffer_load_dword v44, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GCN-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GCN-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll b/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll
index 9dbeba2a095d4..15b50fc95c944 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll
@@ -393,7 +393,7 @@ define <8 x half> @baz() nounwind {
 ; CHECK-NEXT:    global_load_b128 v[18:21], v[8:9], off offset:80
 ; CHECK-NEXT:    global_load_b128 v[22:25], v[8:9], off offset:96
 ; CHECK-NEXT:    global_load_b128 v[26:29], v[8:9], off offset:112
-; CHECK-NEXT:    v_writelane_b32 v93, s30, 0
+; CHECK-NEXT:    v_writelane_b32 v93, s34, 0
 ; CHECK-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; CHECK-NEXT:    s_wait_loadcnt 0x4
 ; CHECK-NEXT:    scratch_store_b128 off, v[10:13], s33 offset:276 ; 16-byte Folded Spill
@@ -409,27 +409,27 @@ define <8 x half> @baz() nounwind {
 ; CHECK-NEXT:    scratch_store_b128 off, v[30:33], s33 offset:356
 ; CHECK-NEXT:    scratch_store_b128 off, v[34:37], s33 offset:372
 ; CHECK-NEXT:    scratch_store_b128 off, v[38:41], s33 offset:388
-; CHECK-NEXT:    v_writelane_b32 v93, s31, 1
-; CHECK-NEXT:    v_writelane_b32 v93, s34, 2
-; CHECK-NEXT:    v_writelane_b32 v93, s35, 3
+; CHECK-NEXT:    v_writelane_b32 v93, s35, 1
 ; CHECK-NEXT:    s_mov_b64 s[34:35], s[10:11]
-; CHECK-NEXT:    v_writelane_b32 v93, s36, 4
-; CHECK-NEXT:    v_writelane_b32 v93, s37, 5
+; CHECK-NEXT:    v_writelane_b32 v93, s36, 2
+; CHECK-NEXT:    v_writelane_b32 v93, s37, 3
 ; CHECK-NEXT:    s_mov_b64 s[36:37], s[8:9]
-; CHECK-NEXT:    v_writelane_b32 v93, s38, 6
-; CHECK-NEXT:    v_writelane_b32 v93, s39, 7
+; CHECK-NEXT:    v_writelane_b32 v93, s38, 4
+; CHECK-NEXT:    v_writelane_b32 v93, s39, 5
 ; CHECK-NEXT:    s_mov_b64 s[38:39], s[6:7]
-; CHECK-NEXT:    v_writelane_b32 v93, s48, 8
-; CHECK-NEXT:    v_writelane_b32 v93, s49, 9
+; CHECK-NEXT:    v_writelane_b32 v93, s48, 6
+; CHECK-NEXT:    v_writelane_b32 v93, s49, 7
 ; CHECK-NEXT:    s_mov_b64 s[48:49], s[4:5]
-; CHECK-NEXT:    v_writelane_b32 v93, s50, 10
+; CHECK-NEXT:    v_writelane_b32 v93, s50, 8
 ; CHECK-NEXT:    s_mov_b32 s50, s15
-; CHECK-NEXT:    v_writelane_b32 v93, s51, 11
+; CHECK-NEXT:    v_writelane_b32 v93, s51, 9
 ; CHECK-NEXT:    s_mov_b32 s51, s14
-; CHECK-NEXT:    v_writelane_b32 v93, s52, 12
+; CHECK-NEXT:    v_writelane_b32 v93, s52, 10
 ; CHECK-NEXT:    s_mov_b32 s52, s13
-; CHECK-NEXT:    v_writelane_b32 v93, s53, 13
+; CHECK-NEXT:    v_writelane_b32 v93, s53, 11
 ; CHECK-NEXT:    s_mov_b32 s53, s12
+; CHECK-NEXT:    v_writelane_b32 v93, s30, 12
+; CHECK-NEXT:    v_writelane_b32 v93, s31, 13
 ; CHECK-NEXT:    s_wait_kmcnt 0x0
 ; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[0:1]
@@ -554,20 +554,20 @@ define <8 x half> @baz() nounwind {
 ; CHECK-NEXT:    scratch_load_b32 v42, off, s33 offset:136
 ; CHECK-NEXT:    scratch_load_b32 v41, off, s33 offset:140
 ; CHECK-NEXT:    scratch_load_b32 v40, off, s33 offset:144
-; CHECK-NEXT:    v_readlane_b32 s53, v93, 13
-; CHECK-NEXT:    v_readlane_b32 s52, v93, 12
-; CHECK-NEXT:    v_readlane_b32 s51, v93, 11
-; CHECK-NEXT:    v_readlane_b32 s50, v93, 10
-; CHECK-NEXT:    v_readlane_b32 s49, v93, 9
-; CHECK-NEXT:    v_readlane_b32 s48, v93, 8
-; CHECK-NEXT:    v_readlane_b32 s39, v93, 7
-; CHECK-NEXT:    v_readlane_b32 s38, v93, 6
-; CHECK-NEXT:    v_readlane_b32 s37, v93, 5
-; CHECK-NEXT:    v_readlane_b32 s36, v93, 4
-; CHECK-NEXT:    v_readlane_b32 s35, v93, 3
-; CHECK-NEXT:    v_readlane_b32 s34, v93, 2
-; CHECK-NEXT:    v_readlane_b32 s31, v93, 1
-; CHECK-NEXT:    v_readlane_b32 s30, v93, 0
+; CHECK-NEXT:    v_readlane_b32 s30, v93, 12
+; CHECK-NEXT:    v_readlane_b32 s31, v93, 13
+; CHECK-NEXT:    v_readlane_b32 s53, v93, 11
+; CHECK-NEXT:    v_readlane_b32 s52, v93, 10
+; CHECK-NEXT:    v_readlane_b32 s51, v93, 9
+; CHECK-NEXT:    v_readlane_b32 s50, v93, 8
+; CHECK-NEXT:    v_readlane_b32 s49, v93, 7
+; CHECK-NEXT:    v_readlane_b32 s48, v93, 6
+; CHECK-NEXT:    v_readlane_b32 s39, v93, 5
+; CHECK-NEXT:    v_readlane_b32 s38, v93, 4
+; CHECK-NEXT:    v_readlane_b32 s37, v93, 3
+; CHECK-NEXT:    v_readlane_b32 s36, v93, 2
+; CHECK-NEXT:    v_readlane_b32 s35, v93, 1
+; CHECK-NEXT:    v_readlane_b32 s34, v93, 0
 ; CHECK-NEXT:    s_mov_b32 s32, s33
 ; CHECK-NEXT:    v_readlane_b32 s0, v93, 14
 ; CHECK-NEXT:    s_or_saveexec_b32 s1, -1
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
index ff1475758382f..580ef1522ee14 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
@@ -52,8 +52,8 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_readlane_b32 s31, v44, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v44, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v44, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v44, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -109,8 +109,8 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ; GFX10-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:8
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:12
-; GFX10-NEXT:    v_readlane_b32 s31, v44, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v44, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v44, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s4, v44, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s5, -1
@@ -163,8 +163,8 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ; GFX11-NEXT:    scratch_load_b32 v42, off, s33 offset:4
 ; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:8
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:12
-; GFX11-NEXT:    v_readlane_b32 s31, v44, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v44, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v44, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v44, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -236,8 +236,8 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_readlane_b32 s31, v45, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v45, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v45, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v45, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -286,8 +286,8 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ; GFX10-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8
 ; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:12
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:16
-; GFX10-NEXT:    v_readlane_b32 s31, v45, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v45, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v45, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s4, v45, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s5, -1
@@ -335,8 +335,8 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ; GFX11-NEXT:    scratch_load_b32 v42, off, s33 offset:8
 ; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:12
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:16
-; GFX11-NEXT:    v_readlane_b32 s31, v45, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v45, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v45, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v45, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
diff --git a/llvm/test/CodeGen/AMDGPU/waterfall-call-target-av-register-failure.ll b/llvm/test/CodeGen/AMDGPU/waterfall-call-target-av-register-failure.ll
index 93d864246d68d..b685a79027ba2 100644
--- a/llvm/test/CodeGen/AMDGPU/waterfall-call-target-av-register-failure.ll
+++ b/llvm/test/CodeGen/AMDGPU/waterfall-call-target-av-register-failure.ll
@@ -118,8 +118,8 @@ define i32 @fix_sgpr_copies_indirect_call(ptr addrspace(5) %ptr) {
 ; CHECK-NEXT:    v_readlane_b32 s5, v41, 13
 ; CHECK-NEXT:    s_mov_b64 exec, s[4:5]
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
-; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
 ; CHECK-NEXT:    v_readlane_b32 s30, v40, 0
+; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
 ; CHECK-NEXT:    s_mov_b32 s32, s33
 ; CHECK-NEXT:    v_readlane_b32 s4, v40, 4
 ; CHECK-NEXT:    v_readlane_b32 s34, v40, 2
diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll
index 4a00a09850a58..50220b3e8cd7e 100644
--- a/llvm/test/CodeGen/AMDGPU/wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -3085,8 +3085,8 @@ define void @callee_no_stack_with_call() #1 {
 ; GFX1032-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX1032-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX1032-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX1032-NEXT:    s_mov_b32 s32, s33
 ; GFX1032-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX1032-NEXT:    s_or_saveexec_b32 s5, -1
@@ -3116,8 +3116,8 @@ define void @callee_no_stack_with_call() #1 {
 ; GFX1064-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX1064-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX1064-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX1064-NEXT:    s_mov_b32 s32, s33
 ; GFX1064-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX1064-NEXT:    s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll
index d136696913799..0eda501b29ebf 100644
--- a/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll
+++ b/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll
@@ -1593,8 +1593,8 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; DAGISEL-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; DAGISEL-NEXT:    v_readlane_b32 s31, v40, 2
 ; DAGISEL-NEXT:    v_readlane_b32 s30, v40, 1
+; DAGISEL-NEXT:    v_readlane_b32 s31, v40, 2
 ; DAGISEL-NEXT:    v_readlane_b32 s4, v40, 0
 ; DAGISEL-NEXT:    v_readlane_b32 s0, v40, 3
 ; DAGISEL-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
@@ -1929,8 +1929,8 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; GISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GISEL-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GISEL-NEXT:    v_readlane_b32 s31, v40, 2
 ; GISEL-NEXT:    v_readlane_b32 s30, v40, 1
+; GISEL-NEXT:    v_readlane_b32 s31, v40, 2
 ; GISEL-NEXT:    v_readlane_b32 s4, v40, 0
 ; GISEL-NEXT:    v_readlane_b32 s0, v40, 3
 ; GISEL-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
@@ -2266,8 +2266,8 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; DAGISEL64-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; DAGISEL64-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; DAGISEL64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; DAGISEL64-NEXT:    v_readlane_b32 s31, v40, 3
 ; DAGISEL64-NEXT:    v_readlane_b32 s30, v40, 2
+; DAGISEL64-NEXT:    v_readlane_b32 s31, v40, 3
 ; DAGISEL64-NEXT:    v_readlane_b32 s5, v40, 1
 ; DAGISEL64-NEXT:    v_readlane_b32 s4, v40, 0
 ; DAGISEL64-NEXT:    v_readlane_b32 s0, v40, 4
@@ -2604,8 +2604,8 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; GISEL64-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GISEL64-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GISEL64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GISEL64-NEXT:    v_readlane_b32 s31, v40, 3
 ; GISEL64-NEXT:    v_readlane_b32 s30, v40, 2
+; GISEL64-NEXT:    v_readlane_b32 s31, v40, 3
 ; GISEL64-NEXT:    v_readlane_b32 s5, v40, 1
 ; GISEL64-NEXT:    v_readlane_b32 s4, v40, 0
 ; GISEL64-NEXT:    v_readlane_b32 s0, v40, 4
@@ -3719,8 +3719,8 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; GFX1250-DAGISEL-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX1250-DAGISEL-NEXT:    s_swap_pc_i64 s[30:31], s[0:1]
 ; GFX1250-DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-DAGISEL-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX1250-DAGISEL-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX1250-DAGISEL-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX1250-DAGISEL-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX1250-DAGISEL-NEXT:    v_readlane_b32 s0, v40, 3
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v40, off, s33 nv ; 4-byte Folded Reload
@@ -8048,9 +8048,10 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; DAGISEL-NEXT:    v_writelane_b32 v42, s31, 2
 ; DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; DAGISEL-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; DAGISEL-NEXT:    v_readlane_b32 s30, v42, 1
 ; DAGISEL-NEXT:    flat_store_b32 v[40:41], v0
 ; DAGISEL-NEXT:    v_readlane_b32 s31, v42, 2
-; DAGISEL-NEXT:    v_readlane_b32 s30, v42, 1
 ; DAGISEL-NEXT:    v_readlane_b32 s4, v42, 0
 ; DAGISEL-NEXT:    v_readlane_b32 s0, v42, 3
 ; DAGISEL-NEXT:    s_clause 0x2 ; 12-byte Folded Reload
@@ -8389,9 +8390,10 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; GISEL-NEXT:    v_writelane_b32 v42, s31, 2
 ; GISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GISEL-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT:    v_readlane_b32 s30, v42, 1
 ; GISEL-NEXT:    flat_store_b32 v[40:41], v0
 ; GISEL-NEXT:    v_readlane_b32 s31, v42, 2
-; GISEL-NEXT:    v_readlane_b32 s30, v42, 1
 ; GISEL-NEXT:    v_readlane_b32 s4, v42, 0
 ; GISEL-NEXT:    v_readlane_b32 s0, v42, 3
 ; GISEL-NEXT:    s_clause 0x2 ; 12-byte Folded Reload
@@ -8732,9 +8734,10 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; DAGISEL64-NEXT:    v_writelane_b32 v42, s31, 3
 ; DAGISEL64-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; DAGISEL64-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; DAGISEL64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; DAGISEL64-NEXT:    v_readlane_b32 s30, v42, 2
 ; DAGISEL64-NEXT:    flat_store_b32 v[40:41], v0
 ; DAGISEL64-NEXT:    v_readlane_b32 s31, v42, 3
-; DAGISEL64-NEXT:    v_readlane_b32 s30, v42, 2
 ; DAGISEL64-NEXT:    v_readlane_b32 s5, v42, 1
 ; DAGISEL64-NEXT:    v_readlane_b32 s4, v42, 0
 ; DAGISEL64-NEXT:    v_readlane_b32 s0, v42, 4
@@ -9076,9 +9079,10 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; GISEL64-NEXT:    v_writelane_b32 v42, s31, 3
 ; GISEL64-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GISEL64-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GISEL64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GISEL64-NEXT:    v_readlane_b32 s30, v42, 2
 ; GISEL64-NEXT:    flat_store_b32 v[40:41], v0
 ; GISEL64-NEXT:    v_readlane_b32 s31, v42, 3
-; GISEL64-NEXT:    v_readlane_b32 s30, v42, 2
 ; GISEL64-NEXT:    v_readlane_b32 s5, v42, 1
 ; GISEL64-NEXT:    v_readlane_b32 s4, v42, 0
 ; GISEL64-NEXT:    v_readlane_b32 s0, v42, 4
@@ -10197,9 +10201,10 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; GFX1250-DAGISEL-NEXT:    v_writelane_b32 v42, s30, 1
 ; GFX1250-DAGISEL-NEXT:    v_writelane_b32 v42, s31, 2
 ; GFX1250-DAGISEL-NEXT:    s_swap_pc_i64 s[30:31], s[0:1]
+; GFX1250-DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-DAGISEL-NEXT:    v_readlane_b32 s30, v42, 1
 ; GFX1250-DAGISEL-NEXT:    flat_store_b32 v[40:41], v0
 ; GFX1250-DAGISEL-NEXT:    v_readlane_b32 s31, v42, 2
-; GFX1250-DAGISEL-NEXT:    v_readlane_b32 s30, v42, 1
 ; GFX1250-DAGISEL-NEXT:    v_readlane_b32 s4, v42, 0
 ; GFX1250-DAGISEL-NEXT:    v_readlane_b32 s0, v42, 3
 ; GFX1250-DAGISEL-NEXT:    s_clause 0x2 ; 12-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll
index 06c451869e841..3fe54cd045c0f 100644
--- a/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll
@@ -41,12 +41,12 @@ define void @vector_reg_liverange_split() #0 {
 ; GFX90A-NEXT:    s_or_saveexec_b64 s[28:29], -1
 ; GFX90A-NEXT:    v_accvgpr_read_b32 v39, a32
 ; GFX90A-NEXT:    s_mov_b64 exec, s[28:29]
+; GFX90A-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX90A-NEXT:    v_readlane_b32 s20, v39, 0
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s20
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX90A-NEXT:    s_mov_b32 s32, s33
 ; GFX90A-NEXT:    v_readlane_b32 s4, v40, 4
 ; GFX90A-NEXT:    v_readlane_b32 s28, v40, 2
diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll
index ff33cca0702ae..5009f0249df6d 100644
--- a/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll
@@ -49,10 +49,10 @@ define void @test() #0 {
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_readlane_b32 s4, v39, 0
 ; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    v_readlane_b32 s30, v40, 0
 ; GCN-NEXT:    global_store_dword v[0:1], v0, off
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_readlane_b32 s31, v40, 1
-; GCN-NEXT:    v_readlane_b32 s30, v40, 0
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 4
 ; GCN-NEXT:    v_readlane_b32 s28, v40, 2
@@ -111,8 +111,8 @@ define void @test() #0 {
 ; GCN-O0-NEXT:    v_mov_b32_e32 v2, s4
 ; GCN-O0-NEXT:    global_store_dword v[0:1], v2, off
 ; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
-; GCN-O0-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-O0-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-O0-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-O0-NEXT:    s_mov_b32 s32, s33
 ; GCN-O0-NEXT:    v_readlane_b32 s4, v40, 4
 ; GCN-O0-NEXT:    v_readlane_b32 s28, v40, 2
diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
index cfebf404fe925..39288b0e02ab5 100644
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
@@ -387,8 +387,8 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[40:41]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[36:39], s34 offset:4
-; GFX9-O0-NEXT:    v_readlane_b32 s31, v3, 1
 ; GFX9-O0-NEXT:    v_readlane_b32 s30, v3, 0
+; GFX9-O0-NEXT:    v_readlane_b32 s31, v3, 1
 ; GFX9-O0-NEXT:    s_mov_b32 s32, s33
 ; GFX9-O0-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -424,9 +424,9 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg
 ; GFX9-O3-NEXT:    v_add_u32_e32 v1, v1, v2
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v1
+; GFX9-O3-NEXT:    v_readlane_b32 s30, v3, 0
 ; GFX9-O3-NEXT:    buffer_store_dword v0, off, s[4:7], 0 offset:4
 ; GFX9-O3-NEXT:    v_readlane_b32 s31, v3, 1
-; GFX9-O3-NEXT:    v_readlane_b32 s30, v3, 0
 ; GFX9-O3-NEXT:    s_mov_b32 s32, s33
 ; GFX9-O3-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-O3-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -622,8 +622,8 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX9-O0-NEXT:    s_mov_b32 s34, 0
 ; GFX9-O0-NEXT:    buffer_store_dwordx2 v[0:1], off, s[36:39], s34 offset:4
-; GFX9-O0-NEXT:    v_readlane_b32 s31, v10, 1
 ; GFX9-O0-NEXT:    v_readlane_b32 s30, v10, 0
+; GFX9-O0-NEXT:    v_readlane_b32 s31, v10, 1
 ; GFX9-O0-NEXT:    s_mov_b32 s32, s33
 ; GFX9-O0-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -683,9 +683,9 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v1, v3
+; GFX9-O3-NEXT:    v_readlane_b32 s30, v8, 0
 ; GFX9-O3-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0 offset:4
 ; GFX9-O3-NEXT:    v_readlane_b32 s31, v8, 1
-; GFX9-O3-NEXT:    v_readlane_b32 s30, v8, 0
 ; GFX9-O3-NEXT:    s_mov_b32 s32, s33
 ; GFX9-O3-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-O3-NEXT:    buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload



More information about the llvm-branch-commits mailing list